Notes:
# install.packages('ggplot2')
library(ggplot2)
# install.packages('gridExtra')
library(gridExtra)
Notes:
list.files()
## [1] "arms_exports.csv" "birthdaysExample.csv"
## [3] "code-snippets.R" "demystifying.R"
## [5] "demystifyingR2_v3.html" "demystifyingR2_v3.Rmd"
## [7] "EDA_Course_Materials" "EDA_Course_Materials.zip"
## [9] "lesson3_student.html" "lesson3_student.rmd"
## [11] "NppToR-2.7.0.exe" "problem_set_three.R"
## [13] "pseudo_facebook.tsv" "R_EDU_Materials"
## [15] "reddit.csv" "reddit.R"
## [17] "rsconnect" "RStudio-0.99.896.exe"
## [19] "stateData.csv" "test.pdf"
## [21] "What_is_a_RMD_file.html" "What_is_a_RMD_file.Rmd"
# Load the tab-separated file into pf; read.delim() is the read.table()
# wrapper whose default separator is a tab.
pf <- read.delim('pseudo_facebook.tsv')
# Show the column names of the loaded data frame.
colnames(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes:
# install.packages('ggplot2')
library(ggplot2)
# Histogram of dob_day with qplot's default binning.
qplot(dob_day, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram with an x-axis tick at every value from 1 to 31.
qplot(dob_day, data = pf) +
  scale_x_continuous(breaks = seq_len(31))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# ggplot() version with one-unit-wide bins and the same ticks.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq_len(31))
Response:
Notes:
Notes:
Response:
Response:
Notes:
Notes:
# One dob_day histogram per dob_month, with a tick at every value from 1 to 31.
qplot(dob_day, data = pf) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(~dob_month)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Response:
Notes:
Notes:
#### Which case do you think applies to Moira’s outlier?
Response:
Notes:
# Histogram of friend_count with no axis limits applied.
qplot(friend_count, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Response:
Notes:
# Restrict the x-axis to 0-1000 through qplot's xlim argument.
qplot(friend_count, data = pf, xlim = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# Same restriction expressed as a scale layer.
qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notes:
Notes:
# Exercise: what code would you add to facet the histogram by gender?
# Add it to the code below.
qplot(friend_count, data = pf, binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# Answer: append facet_wrap(~gender) to draw one panel per gender value.
qplot(friend_count, data = pf, binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# Same plot, keeping only the rows whose gender is not NA.
qplot(friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# na.omit(pf) would instead drop every row that has an NA in ANY column,
# not only the rows with a missing gender.
Notes:
# na.omit(pf) - removes every row that contains an NA in ANY column of the dataset.
Notes:
# Count the rows in each gender category.
table(pf$gender)
##
## female male
## 40254 58574
# by(variable, categorical variable, function)
# Apply summary() to friend_count separately for each level of gender.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response:
Men
Response:
22
Response:
Because the FB data is long-tailed, the mean is pulled upward by the data points in the far right tail and is less representative of a typical user, so the median is the better measure here. The median is more robust because it marks the halfway point of our data.
Notes:
# Histogram of tenure with default binning and explicit outline/fill colours.
qplot(tenure, data = pf, color = I('blue'), fill = I('#099DD9'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Same plot with 30-unit-wide bins.
qplot(tenure, data = pf, binwidth = 30,
      color = I('blue'), fill = I('#099DD9'))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# qplot(x = tenure/365, data = pf, binwidth = .25, color = I('red'), fill = I('#099DD9'))
# Plot tenure / 365 with quarter-unit bins.
qplot(tenure / 365, data = pf, binwidth = 0.25,
      color = I('black'), fill = I('#F79420'))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Add ticks at 1 through 7 and limit the x-axis to 0-7.
qplot(tenure / 365, data = pf, binwidth = 0.25,
      color = I('blue'), fill = I('#F79420')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes: Make sensible choices for scales and limits on each axis.
# tenure / 365 histogram with descriptive axis titles set in one labs() call.
qplot(tenure / 365, data = pf, binwidth = 0.25,
      color = I('blue'), fill = I('#F79420')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  labs(x = 'Number of years using FB', y = 'Number of users in sample')
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Age histogram with one-unit bins, x-axis limited to 0-65, ticks at 13..65.
qplot(age, data = pf, binwidth = 1,
      color = I('blue'), fill = I('#5760AB')) +
  scale_x_continuous(breaks = seq(13, 65, 1), limits = c(0, 65)) +
  labs(x = 'Age of FB Users', y = 'Number of users in sample')
## Warning: Removed 10950 rows containing non-finite values (stat_bin).
# ggplot() version without axis limits, with a tick every 5 units from 0.
ggplot(data = pf, mapping = aes(x = age)) +
  geom_histogram(binwidth = 1, fill = '#5760AB') +
  scale_x_continuous(breaks = seq(0, 113, 5))
# Numerical summary of age to compare against the plots.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Response:
Notes:
Notes:
Notes:
Notes:
# Install
# install.packages('gridExtra')
library(gridExtra)
# QPlot syntax: transform the variable itself before plotting.
# p1 = raw counts, p2 = log10(count + 1), p3 = square root of the count.
p1 <- qplot(friend_count, data = pf)
p2 <- qplot(log10(friend_count + 1), data = pf)
p3 <- qplot(sqrt(friend_count), data = pf)
# Stack the three histograms in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Numerical summaries of the same three versions of friend_count.
summary(pf$friend_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
summary(log10(pf$friend_count + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
summary(sqrt(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.090 14.350 70.160
# GGPlot syntax: build the base histogram once, then add a transformed x scale.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
# Stack the three histograms in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Transforming the variable: the x-axis is labelled in LOG units.
logScale <- qplot(x = log10(friend_count), data = pf)
# Transforming the scale: the x-axis is labelled in actual friend counts.
countScale <- ggplot(aes(x = friend_count), data = pf) +
  geom_histogram() +
  scale_x_log10()
# Show both versions side by side to compare the axis labelling.
grid.arrange(logScale, countScale, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
# Original dual histograms, one panel per gender.
# `limits` is spelled out in full instead of relying on partial matching of `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# FreqPolygons Graphs - nice but doesn't answer basic question - which gender on avg has more friends?
qplot(x = friend_count, data = subset(pf, !is.na(gender)),
      binwidth = 10, geom = 'freqpoly', color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
# Alter the count scale to be proportions instead of counts - which will make things more clear.
# Instead of raw counts on the Y Axis, we get proportions
# NOTE(review): sum(..count..) appears to be evaluated across both gender
# lines, so these would be shares of all plotted users rather than
# within-gender shares - confirm; y = ..density.. is the per-group alternative.
qplot(x = friend_count, y = ..count../sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = 'Friend Count',
      ylab = 'Proportion of Users with that Friend Count',
      binwidth = 10, geom = 'freqpoly', color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
# use LIMITS or BREAKS to explore more.
Notes:
# Frequency polygons of www_likes by gender on a log10 x-axis.
# Only scale_x_log10() is added: a preceding scale_x_continuous() would be
# replaced immediately and just triggers a "Scale for 'x' is already present"
# message. y is the default bin count, so the y label reports a number of
# users rather than a proportion.
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      xlab = 'WWW Likes Count',
      ylab = 'Number of Users with that WWW Likes Count',
      geom = 'freqpoly', color = gender) +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# Overall distribution of www_likes.
summary(pf$www_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 49.96 7.00 14860.00
# Total www_likes per gender.
by(pf$www_likes, pf$gender, sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Notes:
# Original dual histograms, one panel per gender.
# `limits` is spelled out in full instead of relying on partial matching of `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Revised to use box plots: friend_count by gender, no axis limits.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot')
# MORE - box plots limited to 0-1000 through qplot's ylim argument.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot', ylim = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
# Another way to do it is to use the scale_y_continuous function - still has inherent problems due to removal of data from consideration.
# NOTE: Using ylim or scale_y_continuous, we actually remove data from consideration
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  scale_y_continuous(limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
# A better method which doesn't remove data is to use the coord_cartesian Layer
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 1000)) # NOTE: coord_cartesian allows our box plots to match summary data.
# Using YLIM would not have allowed them to match up.
# Zoom the box plots to friend counts of 0-250 with the coord_cartesian layer.
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 250))
# Numerical summary per gender to compare against the boxes.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
# Zoom the box plots to friend counts of 0-250 with the coord_cartesian layer.
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 250))
# Numerical summary per gender to compare against the boxes.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Notes:
# Box plots of friendships_initiated by gender, zoomed to 0-150 with
# the coord_cartesian layer.
qplot(gender, friendships_initiated,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 150))
# Check the plot against a numerical summary per gender.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Response:
#### Write about some ways that you can verify your answer.
Response:
# Box plots of friendships_initiated by gender, zoomed to 0-150 with
# the coord_cartesian layer.
qplot(gender, friendships_initiated,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 150))
# Check the plot against a numerical summary per gender.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Response:
Notes:
# Output flawed since there are lots of zeros for those who've never used a feature.
summary(pf$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25110.0
# Converts to logical data (TRUE/FALSE) for better analysis since both now get counted.
summary(pf$mobile_likes > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
# Better still to create a new variable that tracks mobile check-ins:
# 1 if the user has ever used it, 0 if they never have.
# No NA pre-initialisation is needed; the assignment creates the whole column.
pf$mobile_check_in <- ifelse(pf$mobile_likes > 0, 1, 0)
pf$mobile_check_in <- factor(pf$mobile_check_in) # Convert it to a factor variable.
summary(pf$mobile_check_in)
## 0 1
## 35056 63947
# Ratio: what proportion of users check in using mobile? Do this programmatically.
# The mean of a logical vector is the share of TRUE values; multiply by 100
# for a percentage.
mean(pf$mobile_check_in == 1)
## [1] 0.6459097
Response:
Reflection:
Click Knit HTML to see all of your hard work and to generate an HTML page of this lesson, your answers, and your notes!