Lesson 3
Reading in Data
# Print the current working directory so relative file paths can be checked.
getwd()
## [1] "D:/R/Udacity/EDA_Course_Materials/lesson3"
# List the files in the current working directory.
list.files()
## [1] "indicator_female 20-39 percen - Data.csv"
## [2] "indicator_male 20-39 percen - Data.csv"
## [3] "L3.R"
## [4] "L3.rmd"
## [5] "L3_files"
## [6] "lesson3_student.rmd"
## [7] "Population_20_39.rmd"
## [8] "priceHistogram.png"
## [9] "priceHistogrambyclarity.png"
## [10] "priceHistogrambycolor.png"
## [11] "pseudo_facebook.tsv"
## [12] "RPubs - Population aged 20-39 years, both sexes (%).html"
## [13] "RPubs - Population aged 20-39 years, both sexes (%)_files"
# Load the tab-separated pseudo-Facebook sample into `pf`.
# read.delim() is read.csv() with a tab as the default separator.
pf <- read.delim("pseudo_facebook.tsv")
library(ggplot2)
library(ggthemes)
# Show the column names of the loaded data frame.
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
#theme_set(theme_minimal(24))
#qplot(x = dob_day, data = pf)
# Histogram of birth day-of-month, drawn as one panel per birth month.
# dob_day is numeric, so the axis needs a continuous scale; a discrete scale
# on continuous data is rejected or mis-drawn by current ggplot2.
# binwidth = 1 gives exactly one bar per calendar day instead of the default
# binning, which does not line up with the integer days.
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)

Count friends
# Histogram of friend counts with the x axis restricted to 0-1000.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  xlim(c(0, 1000))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Alternative histogram: 10-wide bins, tick marks every 50, one panel per
# gender. na.omit() drops every row of pf that contains any missing value.
ggplot(data = na.omit(pf), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)

Statistics ‘by’ Gender
# Count users per gender; table() leaves out NA values by default.
table(pf$gender)
##
## female male
## 40254 58574
# Summary statistics of friend_count computed separately for each gender.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Tenure
# Histogram of tenure divided by 365 (labelled as years on the axis),
# in bins 0.25 wide, with the x axis limited to 0-7 and a tick at each unit.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, colour = "black", fill = "#F79420") +
  xlab("Number of years using Facebook") +
  ylab("Number of users in sample") +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1))

Age
# Histogram of user age in 5-wide bins, x axis limited to 13-80 with a tick
# every 10 starting at 13.
ggplot(data = pf, mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "#FF9999") +
  xlab("Age of Facebook users") +
  ylab("Number of users in sample") +
  scale_x_continuous(limits = c(13, 80), breaks = seq(13, 80, 10))

Transforming Data
Scaled data
library(gridExtra)
## Loading required package: grid
# Three views of friend_count: raw, log10 (shifted by 1 so that zero counts
# stay finite), and square root.
p1 <- ggplot(pf, aes(x = friend_count)) + geom_histogram()
p2 <- ggplot(pf, aes(x = log10(friend_count + 1))) + geom_histogram()
p3 <- ggplot(pf, aes(x = sqrt(friend_count))) + geom_histogram()
# Lay the three histograms out side by side.
grid.arrange(p1, p2, p3, ncol = 3)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Frequency Polygons (before we had histograms)
# Histogram of friend counts, one panel per gender, excluding rows whose
# gender is missing.
# The bin width argument was misspelled `biwidth`, so it was silently ignored
# and stat_bin fell back to its default; it is now passed as `binwidth`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)

# Frequency polygon of friend counts coloured by gender. The y aesthetic
# divides each bin's count by the summed counts, so the axis shows a
# proportion rather than a raw count.
# The bin width argument was misspelled `biwidth`, so it was silently ignored
# and stat_bin fell back to its default; it is now passed as `binwidth`.
qplot(x = friend_count, y = ..count../sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = "Friend Count",
      ylab = "Proportion of Users with that friend count",
      binwidth = 10, geom = "freqpoly", color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).

Frequency Polygons (who has more likes)
# Frequency polygon of www_likes coloured by gender, on a log10 x axis.
# Only one x scale can be active: the earlier bare scale_x_continuous() was
# immediately replaced by scale_x_log10() and only produced a
# "Scale for 'x' is already present" message, so it has been removed.
qplot(x = www_likes,
      data = subset(pf, !is.na(gender)),
      geom = "freqpoly", color = gender) +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Statistics of likes ‘by’ Gender
# Total www_likes for each gender.
by(pf$www_likes, pf$gender, sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Box Plots
# Box plot of friend_count per gender (rows with missing gender excluded).
# coord_cartesian() zooms the y axis to 0-250 without discarding data, so the
# box statistics are still computed from all rows.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

# Numeric summary of friend_count per gender, to read alongside the box plot.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Box Plots, Quartiles, Friend Requests
# Box plot of friendships_initiated per gender (rows with missing gender
# excluded), zoomed to 0-130 on the y axis without discarding data.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 130))

# Summary statistics of friendships_initiated for each gender.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Getting logical
# Distribution summary of the mobile_likes column.
summary(pf$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25110.0
# Count users with at least one mobile like (TRUE) versus none (FALSE).
summary(pf$mobile_likes > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
# Add a two-level factor column to pf: "1" when the user has at least one
# mobile like, "0" otherwise.
# The earlier `mobile_check_in <- NA` created an unrelated global variable
# that was never read; the column is now created directly on the data frame.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
# Count the users in each level.
summary(pf$mobile_check_in)
## 0 1
## 35056 63947
Price Histogram
library(ggplot2)
# Histogram of diamond prices using the default binning.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(colour = "red", fill = "#FF9999") +
  xlab("Price of diamond") +
  ylab("Number of diamonds in sample")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Distribution summary of diamond prices.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
Cheaper diamonds
library(ggplot2)
# Price histogram restricted to 0-3000 with one bin per unit of price and a
# tick every 250.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "red", fill = "#FF9999") +
  xlab("Price of diamond") +
  ylab("Number of diamonds in sample") +
  scale_x_continuous(limits = c(0, 3000), breaks = seq(0, 3000, 250))

# Write the plot above to disk; ggsave() saves the most recent plot by default.
ggsave("priceHistogram.png")
## Saving 7 x 5 in image
#There are no diamonds that cost $1500.
#For diamonds that cost less than $2,000, the most common price of a diamond is around $700 with the mode being $605 (binwidth = 1).
Price by Cut Histogram
library(ggplot2)
# Price histogram with one panel per cut, arranged in two columns.
# The plot was previously only assigned to `a` and never drawn; it is now
# printed explicitly so the figure actually appears. `a` is kept so any later
# use of the variable still works.
a <- qplot(price, data = diamonds,
           xlab = 'Price of diamond',
           ylab = 'Number of diamonds in sample',
           color = I('red'), fill = I("#FF9999")) +
  facet_wrap(~cut, ncol = 2)
print(a)
# Highest price within each cut.
by(diamonds$price,diamonds$cut,max)
## diamonds$cut: Fair
## [1] 18574
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 18788
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 18818
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 18823
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 18806
# Lowest price within each cut.
by(diamonds$price,diamonds$cut,min)
## diamonds$cut: Fair
## [1] 337
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 327
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 336
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 326
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 326
# Median price within each cut.
by(diamonds$price,diamonds$cut,median)
## diamonds$cut: Fair
## [1] 3282
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 3050.5
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 2648
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 3185
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 1810
Scales and Multiple Histograms
# Price histogram per cut. scales = "free_y" gives every panel its own y axis
# so the shapes can be compared even though the group sizes differ.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(colour = "red", fill = "#FF9999") +
  xlab("Price of diamond") +
  ylab("Number of diamonds in sample") +
  facet_wrap(~cut, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Full summary statistics of price within each cut.
by(diamonds$price, diamonds$cut, summary)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 2050 3282 4359 5206 18570
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 1145 3050 3929 5028 18790
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 912 2648 3982 5373 18820
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1046 3185 4584 6296 18820
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 878 1810 3458 4678 18810
Price per Carat by Cut
# Histogram of price divided by carat, in 50-wide bins, one panel per cut.
ggplot(data = diamonds, mapping = aes(x = price / carat)) +
  geom_histogram(binwidth = 50, colour = "red", fill = "#FF9999") +
  xlab("Price per carat") +
  ylab("Number of diamonds in sample") +
  facet_wrap(~cut)

# Same price-per-carat histogram per cut, drawn on a log10 x axis.
# NOTE(review): with scale_x_log10() the bin width is presumably measured in
# log10 units, so binwidth = 1 would make each bin span a factor of ten;
# confirm this coarse binning is intended.
ggplot(data = diamonds, mapping = aes(x = price / carat)) +
  geom_histogram(binwidth = 1, colour = "red", fill = "#FF9999") +
  xlab("Price per carat") +
  ylab("Number of diamonds in sample") +
  facet_wrap(~cut) +
  scale_x_log10()

Price Box Plots by clarity
# Box plot of price for each clarity grade (rows with missing clarity
# excluded).
ggplot(data = subset(diamonds, !is.na(clarity)),
       mapping = aes(x = clarity, y = price)) +
  geom_boxplot()

# Summary statistics of price within each clarity grade.
by(diamonds$price, diamonds$clarity, summary)
## diamonds$clarity: I1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 345 2080 3344 3924 5161 18530
## --------------------------------------------------------
## diamonds$clarity: SI2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 2264 4072 5063 5777 18800
## --------------------------------------------------------
## diamonds$clarity: SI1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1089 2822 3996 5250 18820
## --------------------------------------------------------
## diamonds$clarity: VS2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 900 2054 3925 6024 18820
## --------------------------------------------------------
## diamonds$clarity: VS1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 876 2005 3839 6023 18800
## --------------------------------------------------------
## diamonds$clarity: VVS2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336.0 794.2 1311.0 3284.0 3638.0 18770.0
## --------------------------------------------------------
## diamonds$clarity: VVS1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 816 1093 2523 2379 18780
## --------------------------------------------------------
## diamonds$clarity: IF
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 369 895 1080 2865 2388 18810
# Save the most recent plot to disk (ggsave() uses the last plot by default).
# NOTE(review): the file name says "Histogram", but the last plot drawn in
# this section is the clarity box plot; confirm the intended file name.
ggsave('priceHistogrambyclarity.png')
## Saving 7 x 5 in image
Price Box Plots by color
# Box plot of price for each colour grade (rows with missing colour excluded).
ggplot(data = subset(diamonds, !is.na(color)),
       mapping = aes(x = color, y = price)) +
  geom_boxplot()

# Summary statistics of price within each colour grade.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
# Interquartile range of price for diamonds of colour D.
with(subset(diamonds, color == "D"), IQR(price))
## [1] 3302.5
# Interquartile range of price for diamonds of colour J.
with(subset(diamonds, color == "J"), IQR(price))
## [1] 5834.5
# Save the most recent plot to disk (ggsave() uses the last plot by default).
# NOTE(review): the file name says "Histogram", but the last plot drawn in
# this section is the colour box plot; confirm the intended file name.
ggsave('priceHistogrambycolor.png')
## Saving 7 x 5 in image
Carat Frequency Polygon
# Frequency polygon of carat weight using 0.01-wide bins.
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01)
