Notes: Download pseudo_facebook.tsv
Notes:
# Check which directory R is working in and that the data file is there.
getwd()
## [1] "D:/Documents/5. Dev/1. udacity/1. data_analysis_w_R/EDA_Course_Materials/lesson3"
list.files()
## [1] "lesson3_student.rmd" "lesson3_student_files" "problem_set3.Rmd"
## [4] "pseudo_facebook.tsv"
# The file is tab-separated; read.delim() is read.csv() with sep = "\t".
pf <- read.delim("pseudo_facebook.tsv")
# List the columns that were read.
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes:
# Install ggplot2 only when it is missing, so re-running or re-knitting these
# notes does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/hahnsang/Documents/R/win-library/3.2'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\hahnsang\AppData\Local\Temp\1\RtmpQf6870\downloaded_packages
library(ggplot2)
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
# Histogram of the day of month users were born on.
# binwidth = 1 gives one bar per day; with the default of 30 bins the 31
# possible day values do not line up with the bin edges.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
Response: the first day peaks high and then the other days are flat
Notes:
Notes:
Response:
Response:
Notes:
Notes: facet_wrap(formula), e.g. facet_wrap(~variable); facet_grid(formula), e.g. facet_grid(vertical ~ horizontal)
# Same birthday histogram, split into one panel per birth month (3 columns).
# binwidth = 1 gives one bar per day; with the default of 30 bins the 31
# possible day values do not line up with the bin edges.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
Response: January is unusual
Notes: Outliers may be accurate data about a very extreme case
Notes: #### Which case do you think applies to Moira's outlier? Response: bad data about an extreme case
Notes:
# Raw distribution of friend counts (default binning).
qplot(x = friend_count, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Limit the x axis to 0-1000 via the xlim argument of qplot() ...
qplot(x = friend_count, data = pf, xlim = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# ... or by adding a scale layer; both report the same removed rows.
qplot(x = friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notes:
Notes:
# What code would you add to create a facet the histogram by gender?
# Add it to the code below.
# Answer: the facet_wrap(~gender) layer draws one histogram panel per gender.
qplot(x = friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 50)
  ) +
  facet_wrap(~gender)
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notes:
# Drop users whose gender is missing before faceting by gender.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 25
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Same filter written with bracket indexing. na.omit(pf) is deliberately not
# used: it removes every row that has an NA in *any* column, not only rows
# where gender is missing, so it can silently discard extra users.
qplot(
  x = friend_count,
  data = pf[!is.na(pf$gender), ],
  binwidth = 25
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
Notes: We still don’t know which is more than the other, so use table() and by()
# Count the users in each gender category.
table(pf$gender)
##
## female male
## 40254 58574
# Summary statistics of friend_count, computed separately for each gender.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response: woman
Response: 22
Response: more robust
Notes:
# Tenure measured in days, in 30-day bins.
qplot(
  x = tenure, data = pf, binwidth = 30,
  color = I("black"), fill = I("#099DD9")
)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Tenure converted to years, in quarter-year bins, x axis limited to 0-7.
qplot(
  x = tenure / 365, data = pf, binwidth = 0.25,
  color = I("black"), fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes: x axis and y axis are automatically generated unless you specify them
# Tenure in years with explicit axis labels.
# binwidth = 0.25 (quarter-year bins) matches the unlabelled tenure plot;
# without it qplot falls back to 30 default bins.
qplot(
  x = tenure / 365, data = pf, binwidth = 0.25,
  xlab = "Number of years using Facebook",
  ylab = "Number of users in sample",
  color = I("black"), fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Age distribution with one bar per year of age and a tick every 5 years.
qplot(
  x = age, data = pf, binwidth = 1,
  xlab = "Ages",
  ylab = "Number of users in sample",
  color = I("black"), fill = I("#5760AB")
) +
  scale_x_continuous(breaks = seq(0, 113, by = 5))
Response: Ages start at 13, and there is a peak around 100
Notes:
Notes:
Notes: Log Transformations of Data http://www.r-statistics.com/2013/05/log-transformations-for-skewed-and-wide-distributions-from-practical-data-science-with-r/
# Untransformed friend_count histogram, for reference.
qplot(x = friend_count, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Summary on the original scale; the minimum is 0.
summary(pf$friend_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
# log10 of a zero count is -Inf, which is why Min. and Mean come out as -Inf.
summary(log10(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -Inf 1 2 -Inf 2 4
# Adding 1 before taking the log keeps zero counts finite (log10(1) = 0).
summary(log10(pf$friend_count+1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Square-root transform as an alternative to the log.
summary(sqrt(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.090 14.350 70.160
# Install gridExtra only when it is missing, so re-running or re-knitting
# these notes does not download and reinstall the package every time.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/hahnsang/Documents/R/win-library/3.2'
## (as 'lib' is unspecified)
## package 'gridExtra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\hahnsang\AppData\Local\Temp\1\RtmpQf6870\downloaded_packages
library(gridExtra)
# Three histograms of friend_count: raw, log10(count + 1) and sqrt(count).
# Columns are referenced by name so they are looked up in `data = pf`;
# writing pf$friend_count inside qplot() bypasses the data argument and
# leaks "pf$" into the axis labels.
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count + 1), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
# Stack the three plots in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same comparison built with ggplot(): the data stay untransformed and the
# x axis scale is transformed instead.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
# Stack the three plots in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
***
Notes: Only difference is actual count on x axis
# Transforming the variable (left) versus transforming the axis scale (right).
# friend_count is referenced by name so it is looked up in `data = pf`
# rather than bypassing the data argument with pf$friend_count.
logScale <- qplot(x = log10(friend_count), data = pf)
countScale <- ggplot(aes(x = friend_count), data = pf) +
  geom_histogram() +
  scale_x_log10()
# Show the two plots side by side.
grid.arrange(logScale, countScale, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
Question: Which plot better shows who has more friends on average, men or women?
# Faceted histograms: one panel per gender, bins of 10 friends.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 10
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# The same counts as overlaid frequency polygons, one coloured line per gender.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 10, geom = "freqpoly", color = gender
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
# Frequency polygons with the y axis rescaled from counts to proportions.
qplot(
  x = friend_count, y = ..count.. / sum(..count..),
  data = subset(pf, !is.na(gender)),
  xlab = "Friend Count",
  ylab = "Proportion of Users with that friend count",
  binwidth = 10, geom = "freqpoly", color = gender
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notes: Use a frequency polygon to determine which gender makes more likes on the world wide web. What’s the www_likes count for males? Which gender has more www_likes?
# Frequency polygon of www_likes per gender on a log10 x axis.
# Only one x scale is added: a second scale for the same axis replaces the
# first and triggers a "Scale for 'x' is already present" message.
qplot(
  x = www_likes,
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly", color = gender
) +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# The same frequency polygon written with ggplot() layers.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = www_likes)
) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# Total number of www_likes for each gender.
by(pf$www_likes, pf$gender, sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Notes: check outliers How to read boxplot http://flowingdata.com/2008/02/15/how-to-read-and-use-a-box-and-whisker-plot/ Interquartile range or IQR https://en.wikipedia.org/wiki/Interquartile_range Visualization https://en.wikipedia.org/wiki/File:Boxplot_vs_PDF.svg
# Box plot of friend_count for each gender over the full range.
qplot(
  x = gender, y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
)
# Same plot with the y scale limited to 0-1000. Setting limits on the scale
# drops the rows outside the range before the boxes are computed (see the
# "Removed ... rows" warning this produces).
qplot(
  x = gender, y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  scale_y_continuous(limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
Notes:
# Zoom the y axis to 0-250 with coord_cartesian(), which changes the visible
# window only; no rows are reported as removed for this plot.
qplot(
  x = gender, y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))
# Numeric summary per gender to compare against the boxes.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response: female #### Write about some ways that you can verify your answer. Response:
# Friendships initiated per gender, y axis zoomed to 0-500.
qplot(
  x = gender, y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 500))
# Closer zoom (0-150) to compare the boxes themselves.
qplot(
  x = gender, y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 150))
# Numeric summary per gender to check what the plots suggest.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Response: It’s helpful for understanding the distribution of the data: we can see the middle 50% of values for each segment of our categorical variable. Our plots also let us get a sense of outliers. In one way they’re much richer in information than just this table.
Notes: how to handle lots of zero values
# Distribution of mobile likes; the median is far below the mean.
summary(pf$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25110.0
# How many users have at least one mobile like?
summary(pf$mobile_likes > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
# Flag users with at least one mobile like as a 0/1 factor column on pf.
# (No separate global `mobile_check_in` is created: only the data frame
# column is ever used.)
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)
## 0 1
## 35056 63947
# Share of users who checked in via mobile: the mean of a logical vector is
# the proportion of TRUE values.
mean(pf$mobile_check_in == 1)
## [1] 0.6459097
What percent of check in using mobile? Response: 65%
Reflection: I learned how to visualize data with histograms, frequency polygons with scaling layers, and box plots. Box plots are useful for spotting outliers. I also learned about logical operations. Concept map: https://wiki.uiowa.edu/download/attachments/42009071/Concept_Map.gif?version=1&modificationDate=1287007903090&api=v2
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!