Notes: Change directories and load the dataset
Notes: read in the TAB separated values from the dataset
# Load the tab-separated pseudo-Facebook data set and list its column names.
pf <- read.delim("pseudo_facebook.tsv")
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes:
# Install ggplot2 only when it is missing. `repos` must name a repository root
# (a CRAN mirror), not the URL of a single package archive: install.packages()
# appends "/src/contrib" and "/bin/..." to `repos` to locate the package index.
# The archive URL passed previously is what produced the "unable to access
# index" warnings recorded below.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.rstudio.com")
}
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/ggplot2_2.1.0.tgz/src/contrib:
## cannot download all files
## Warning: package 'ggplot2' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/ggplot2_2.1.0.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(ggplot2)

# Histogram of birth day-of-month, one panel per birth month.
# binwidth = 1 gives every day its own bar; the default of 30 bins spread over
# the 31 possible day values merges two neighbouring days into a single bar.
qplot(data = pf, x = dob_day, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
Response: A lot of people have a birthday of Jan. 1, likely because they kept the default date or chose not to enter their real birthday for privacy reasons. The Jan. 1 spike makes the rest of the plot hard to read.
Notes: Moira investigated the number of views one’s post got vs. how many views they thought they got.
Notes: On average, people estimated they got a fourth of the views they actually got.
Notes:
# Histogram of birth day-of-month, one panel per birth month.
# binwidth = 1 gives every day its own bar; the default of 30 bins spread over
# the 31 possible day values merges two neighbouring days into a single bar.
qplot(data = pf, x = dob_day, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
Response: Jan. 1 is an outlier
Notes: One respondent estimated that 1,000,000 friends saw their post, even though the maximum number of friends is 5,000.
# Friend-count histogram (bins of 25), one panel per gender; rows with a
# missing gender are dropped first.
ggplot(data = subset(pf, !is.na(gender)), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  facet_wrap(~gender, ncol = 2)
***
Response: It is hard to read
Notes:
# Same per-gender histogram, with the x axis restricted to 0-1000 friends and
# a tick every 50.
ggplot(data = subset(pf, !is.na(gender)), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender, ncol = 2)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Exercise: what code would you add to facet this histogram by gender?
# Add it to the code below.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notes:
# Friend-count histogram (bins of 10, x limited to 0-1000) faceted by gender,
# using only the rows where gender is recorded.
ggplot(data = subset(pf, !is.na(gender)), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
Notes:
# Five-number summary plus mean of friend_count, computed separately per gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response: Women
Response: 22
Response: Because the median is not affected by outliers
Notes:
# Histogram of tenure divided by 365, in bins of half a unit.
ggplot(data = pf, mapping = aes(x = tenure/365)) +
  geom_histogram(binwidth = 0.5)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Same tenure histogram with outlined, coloured bars and the x axis limited to
# 0-7 with a tick at every whole number from 1 to 7.
ggplot(data = pf, mapping = aes(x = tenure/365)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "#099DD9") +
  scale_x_continuous(breaks = 1:7, limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Tenure histogram with explicit axis labels.
ggplot(data = pf, mapping = aes(x = tenure/365)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "#099DD9") +
  scale_x_continuous(breaks = 1:7, limits = c(0, 7)) +
  xlab("Number of years using Facebook") +
  ylab("Number of users")
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Histogram of user age in one-year bins.
# Axis breaks start at 0 so the ticks fall on multiples of 5 (0, 5, 10, ...),
# matching the lower axis limit; starting them at 1 labelled the axis at
# 1, 6, 11, ... instead.
qplot(x = age, data = pf, binwidth = 1,
      xlab = 'Years old',
      ylab = 'Number of Users',
      color = I('black'), fill = I('orange')) +
  scale_x_continuous(breaks = seq(0, 113, 5), limits = c(0, 113))
Response: There are some spikes at ages over 100, which are unlikely to be real ages.
Notes:
# Install gridExtra only when it is missing. `repos` must name a repository
# root (a CRAN mirror), not the URL of a single package archive:
# install.packages() appends "/src/contrib" and "/bin/..." to `repos` to locate
# the package index. The archive URL passed previously is what produced the
# "unable to access index" warnings recorded below.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra", repos = "https://cran.rstudio.com")
}
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/gridExtra_2.2.1.tgz/src/contrib:
## cannot download all files
## Warning: package 'gridExtra' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/gridExtra_2.2.1.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(gridExtra)

# Base histogram of friend_count, then the same plot on log10 and square-root
# x scales.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()

# Stack the three panels in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes: (Another way to code it)
# qplot() version of the three panels: untransformed, log10 and square-root
# x scales, with the transformed panels relabelled on the x axis.
p1 <- qplot(data = pf, x = friend_count)
p2 <- qplot(data = pf, x = friend_count, xlab = "friend_count log10") +
  scale_x_log10()
p3 <- qplot(data = pf, x = friend_count, xlab = "friend_count sqrt") +
  scale_x_sqrt()

# Stack the three panels in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Frequency polygons of friend_count, one coloured line per gender. The y value
# is each bin's count divided by sum(..count..) instead of the raw count.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = friend_count,
                y = ..count../sum(..count..),
                colour = gender)
) +
  geom_freqpoly(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab("Friend Count") +
  ylab("Proportion of Users with that Friend Count")
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notes:
# Frequency polygons of www_likes per gender on a log10 x axis.
# Only scale_x_log10() is added: a preceding scale_x_continuous() would be
# replaced by it immediately and only triggers a "Scale for 'x' is already
# present" message.
qplot(x = www_likes,
      data = subset(pf, !is.na(gender)),
      xlab = 'Likes',
      ylab = 'Number',
      geom = 'freqpoly',
      color = gender) +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# Total number of www_likes for each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Notes:
# Box plot of friend_count for each gender (rows with missing gender removed).
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot()
# Box plot with the y scale limited to 0-1000. The `ylim` argument removes
# rows outside the limits before the box statistics are computed. The lower
# limit is 0, as in the other limited plots of friend_count; a lower limit of 1
# would additionally drop every user with no friends.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot', ylim = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
#qplot(x = gender, y = friend_count,
#data = subset(pf, !is.na(gender)),
#geom = 'boxplot') +
#scale_y_continuous(limits = c(0, 1000))
# Box plot of friend_count per gender, with the visible y range set to 0-250
# through coord_cartesian().
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
Notes:
# Box plot of friend_count per gender, with the visible y range set to 0-250
# through coord_cartesian().
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
# Numeric summary of friend_count per gender, to read alongside the box plot.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response: Women
Response: Using the by function
# Numeric summary of friendships_initiated, computed separately per gender.
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
# Box plot of friendships_initiated per gender, with the visible y range set to
# 0-125 through coord_cartesian().
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 125))
# Count how many users have at least one mobile like (TRUE) versus none (FALSE).
summary(pf[["mobile_likes"]] > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
# Flag each user as 'Yes' (at least one mobile like) or 'No', store the flag as
# a factor column, and show the count per level.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, "Yes", "No"))
summary(pf$mobile_check_in)
## No Yes
## 35056 63947
# Share of users flagged 'Yes': the mean of a logical vector is the fraction
# of TRUE values.
mean(pf$mobile_check_in == "Yes")
## [1] 0.6459097
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!