Notes: Check working directory
# Show the current working directory; the relative file paths used below
# are resolved against it.
getwd()
## [1] "C:/Dersler/Udacity_R/W3"
# List the files in the working directory ("." is the default path).
list.files(path = ".")
## [1] "lesson3_student.html" "lesson3_student.rmd" "pseudo_facebook.tsv"
Notes:
# Load the tab-separated pseudo-Facebook data set into a data frame.
# read.delim() shares read.csv()'s defaults (header, quoting, fill) and only
# differs in the separator, which is spelled out here.
pf <- read.delim("pseudo_facebook.tsv", sep = "\t")
# Report how many rows and columns were read.
dim(pf)
## [1] 99003 15
Notes:
# One-time setup (uncomment if ggplot2 is not installed yet):
# install.packages("ggplot2")
library(ggplot2)
# Print the column names of the data set.
colnames(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
# Histogram of the day of the month on which users report being born.
# dob_day is numeric, so the axis uses a continuous scale: a discrete scale on
# numeric data does not reliably draw the 1-31 tick labels in ggplot2 2.0 and
# later. binwidth = 1 gives one bar per day (the old default, range/30, was
# also 1 for values spanning 1-31).
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
Response: Many people say they were born on the 1st of the month, and fewer people report being born on the 31st.
Notes: She compared users' perception of their audience size with their actual audience size.
Notes: Maybe 25 which is 5%
Response: 25
Response: %10
Notes: 50
Notes: We now expand the graph and plot the days of the month separately for each birth month.
# Histogram of reported birth day, drawn as one panel per birth month
# (3 panels per row).
# dob_day is numeric, so the axis uses a continuous scale: a discrete scale on
# numeric data does not reliably draw the 1-31 tick labels in ggplot2 2.0 and
# later. binwidth = 1 gives one bar per day of the month.
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
Response: Many of the users report being born on the 1st of January! ***
Notes:
Notes: She noticed that one user guessed that he or she had an audience of 10 million.
Response: Bad data, or an extreme outlier.
Notes:
# Histogram of friend counts across all users, using the default binning.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Response: There is a user with about 5000 friends.
Notes: We trim the graph and omit the values above 1000.
# Histogram of friend counts restricted to the 0-1000 range; users outside
# that range are dropped before binning.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Equivalent shorthand for the same plot:
# qplot(data = pf, x = friend_count, xlim = c(0, 1000))
Notes: People tend to report round numbers such as 10, 20, or 100 rather than 16, 47, etc. This becomes clearer when the bin width is set to 1.
Notes:
## What code would you add to facet the histogram by gender?
## Add it to the code below.
# Friend-count histogram (bins of 25, 0-1000 range, ticks every 50) with one
# panel per gender value.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
# Alternate solution: facet_grid(gender ~ .) splits the data by gender and
# produces three histograms, each on its own row. The dot on the right-hand
# side of the formula means "do not facet across columns".
# qplot(x = friend_count, data = pf) +
#   facet_grid(gender ~ .)
Notes:
# Same faceted histogram with finer bins (width 10), keeping only the rows
# whose gender is not missing so that no NA panel is drawn.
ggplot(pf[!is.na(pf$gender), ], aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
# Another way of omitting NA values is na.omit(). Note that na.omit(pf) drops
# every row with an NA in ANY column, not only in gender, so it can remove
# more rows than the filter above.
# qplot(x = friend_count, data = na.omit(pf), binwidth = 10) +
#   scale_x_continuous(limits = c(0, 1000),
#                      breaks = seq(0, 1000, 50)) +
#   facet_wrap(~gender)
Notes:
# Count the users in each gender category (table() leaves NA out by default).
table(pf$gender)
##
## female male
## 40254 58574
# Summary statistics of friend_count, computed separately for each gender.
by(pf$friend_count,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response:women
Response:22
Response: The distribution is skewed, so the median gives a better representation.
Notes:
# Tenure measured in days, in 30-day bins.
ggplot(pf, aes(x = tenure)) +
  geom_histogram(binwidth = 30, color = "black", fill = "#099DD9")
# Tenure converted to years, in quarter-year bins, limited to years 0-7 with
# a tick at every whole year.
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "#099DD9") +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
Notes:
# Tenure in years (quarter-year bins, years 0-7) with descriptive axis labels.
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "#099DD9") +
  labs(x = "Number of years using Facebook",
       y = "Number of Users in sample") +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
Notes: I noticed that there are no users aged 0-13, due to Facebook policy. There are some strange ages over 100 (I don't think a person over 100 would even fancy a computer).
# User ages in 2-year bins, limited to ages 10-90 with a tick every 5 years.
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 2, color = "black", fill = "#099DD9") +
  labs(x = "Facebook User ages",
       y = "Number of Users in sample") +
  scale_x_continuous(breaks = seq(10, 90, 5), limits = c(10, 90))
Response:
Notes:
Notes:
Notes:
library(gridExtra)
## Loading required package: grid
# Summary of the raw friend counts.
summary(pf$friend_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
# Log10 of the raw counts: users with 0 friends give log10(0) = -Inf, which
# also makes the mean -Inf.
summary(log10(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -Inf 1 2 -Inf 2 4
# Adding 1 before taking the log keeps zero-friend users finite (log10(1) = 0).
summary(log10(pf$friend_count + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Square-root transform as an alternative; it is defined at 0.
summary(sqrt(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.090 14.350 70.160
# Three histograms of friend_count: raw, log10(x + 1) and square root. The
# transformation is applied to the data itself, so the axes show transformed
# values.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
p2 <- ggplot(pf, aes(x = log10(friend_count + 1))) +
  geom_histogram()
p3 <- ggplot(pf, aes(x = sqrt(friend_count))) +
  geom_histogram()
# Stack the three plots in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Same comparison, but built from one base histogram whose x axis is rescaled
# (log10, square root) instead of transforming the variable itself.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 +
  scale_x_log10()
p3 <- p1 +
  scale_x_sqrt()
# Stack the three plots in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Notes:
# Left plot: the variable is transformed, so the axis is labelled in log units.
logScale <- ggplot(pf, aes(x = log10(friend_count + 1))) +
  geom_histogram()
# Right plot: the axis is log-scaled, so it stays labelled in friend counts.
countScale <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()
# Show both versions side by side.
grid.arrange(logScale, countScale, ncol = 2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Friend-count histogram drawn on a log10 x axis.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Friend-count histogram per gender (missing gender excluded), bins of 10,
# 0-1000 range with a tick every 100.
ggplot(pf[!is.na(pf$gender), ], aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100)) +
  facet_wrap(~gender)
# Frequency polygon of the same data, one coloured line per gender, with the
# bin counts divided by their sum so the y axis shows proportions.
ggplot(pf[!is.na(pf$gender), ],
       aes(x = friend_count, y = ..count.. / sum(..count..),
           color = gender)) +
  geom_freqpoly(binwidth = 10) +
  labs(x = "Friend Count",
       y = "Proportion of Users with that friend count") +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100))
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
Notes:
# Frequency polygon of www_likes per gender (missing gender excluded) on a
# log10 x axis.
# Only one x scale is added: a preceding scale_x_continuous() would be
# replaced by scale_x_log10() anyway and merely triggers the
# "Scale for 'x' is already present" message.
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      geom = 'freqpoly', color = gender) +
  scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Total number of www_likes, summed separately for each gender.
by(pf$www_likes,pf$gender,sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Notes:
# Friend-count histogram per gender (missing gender excluded), bins of 25,
# 0-1000 range with a tick every 50.
ggplot(pf[!is.na(pf$gender), ], aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
# Box plot of friend counts for each gender over the full range of values.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot()
# Same box plot with the y axis limited to 0-1000. Setting limits this way
# removes the users outside the range before the box statistics are computed.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  ylim(0, 1000)
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
# Alternative to ylim: set the limits on the y scale directly.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
# Both approaches above actually remove data points (see the warning), which
# changes the boxes. The correct way to zoom in without dropping data is
# coord_cartesian().
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
Notes:
# Zoom the per-gender box plot to 0-250 friends so the quartiles are readable;
# coord_cartesian() zooms without removing any data.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
# Numeric check of the plot: friend_count summary statistics per gender.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response: Women. #### Write about some ways that you can verify your answer. Response: Using the by() command or plotting a box plot gives us the median values.
# Box plot of friendships initiated per gender (missing gender excluded),
# zoomed to 0-150 without removing any data.
ggplot(pf[!is.na(pf$gender), ], aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 150))
# Numeric check: friendships_initiated summary statistics per gender.
by(pf$friendships_initiated,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Response:
Notes:
# Distribution of the number of mobile likes per user.
summary(pf$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25110.0
# How many users have at least one mobile like (TRUE) versus none (FALSE).
summary(pf$mobile_likes > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
# Flag users with at least one mobile like as 1 and all others as 0, stored
# as a factor with levels "0" and "1". (The unused, misnamed placeholder
# variable `mobile_check_in` was dropped.)
pf$mobile_checked_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
# Share of users flagged as checked in: tabulate the factor once, convert the
# counts to proportions, and select level "1" by name rather than by position.
prop.table(table(pf$mobile_checked_in))["1"]
## 1
## 0.6459097
Response:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!