Lesson 3

Reading in Data

# Print the current working directory; the data file further down is read
# through a relative path.
getwd()

## [1] "D:/R/Udacity/EDA_Course_Materials/lesson3"

# List the files in the current working directory.
list.files()

##  [1] "indicator_female 20-39 percen - Data.csv"                 
##  [2] "indicator_male 20-39 percen - Data.csv"                   
##  [3] "L3.R"                                                     
##  [4] "L3.rmd"                                                   
##  [5] "L3_files"                                                 
##  [6] "lesson3_student.rmd"                                      
##  [7] "Population_20_39.rmd"                                     
##  [8] "priceHistogram.png"                                       
##  [9] "priceHistogrambyclarity.png"                              
## [10] "priceHistogrambycolor.png"                                
## [11] "pseudo_facebook.tsv"                                      
## [12] "RPubs - Population aged 20-39 years, both sexes (%).html" 
## [13] "RPubs - Population aged 20-39 years, both sexes (%)_files"

# Attach the plotting packages, then load the tab-separated pseudo-Facebook
# user data and list its column names.
library(ggplot2)
library(ggthemes)
pf <- read.delim('pseudo_facebook.tsv')
names(pf)

##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

#theme_set(theme_minimal(24))
#qplot(x = dob_day, data = pf)
# Histogram of users' day of birth, one panel per birth month.
# dob_day is binned by geom_histogram(), i.e. treated as a number, so it
# needs a continuous x scale; a discrete scale is meant for categorical data
# and does not reliably place the 1-31 breaks on numeric values.
# binwidth = 1 gives one bar per calendar day.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)

Count friends

# Histogram of friend counts with the x axis limited to 0-1000 friends.
qplot(
  friend_count,
  data = pf,
  xlim = c(0, 1000)
)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Another way to do a histogram: friend counts in bins of 10, one panel per
# gender. Only rows with a missing gender are dropped; na.omit(pf) would also
# discard every user that has an NA in any other column.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)

Statistics ‘by’ Gender

# Number of users per gender (table() leaves out missing values by default).
table(pf[["gender"]])

## 
## female   male 
##  40254  58574

# Summary statistics of friend_count, computed separately for each gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Tenure

# Histogram of tenure expressed in years (tenure / 365; presumably stored in
# days -- confirm against the data dictionary), in quarter-year bins, with
# the x axis limited to 0-7 years.
# `limits` is spelled out in full instead of relying on partial matching of
# the abbreviated argument name `lim`.
qplot(x = tenure / 365, data = pf, binwidth = 0.25,
      xlab = 'Number of years using Facebook',
      ylab = 'Number of users in sample',
      color = I('black'), fill = I('#F79420')) +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1))

Age

# Histogram of user age in 5-year bins, with the x axis limited to 13-80.
# `limits` is spelled out in full instead of relying on partial matching of
# the abbreviated argument name `lim`.
qplot(x = age, data = pf, binwidth = 5,
      xlab = 'Age of Facebook users',
      ylab = 'Number of users in sample',
      color = I('black'), fill = I("#FF9999")) +
  scale_x_continuous(limits = c(13, 80), breaks = seq(13, 80, 10))

Transforming Data

Scaled data

library(gridExtra)

## Loading required package: grid

# Three histograms of friend_count -- raw, log10(x + 1) and square root --
# arranged side by side in a single row for comparison.
p1 <- qplot(friend_count, data = pf)
p2 <- qplot(log10(friend_count + 1), data = pf)
p3 <- qplot(sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 3)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Frequency Polygons (before we had histograms)

# Friend-count histograms in bins of 10, one panel per gender (users with a
# missing gender are excluded).
# The argument has to be spelled `binwidth`; the misspelling `biwidth` is not
# recognised, so the plot silently fell back to the default range/30 bins.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Frequency polygon of friend_count by gender, with the counts rescaled to
# proportions (..count.. / sum(..count..)) and drawn in bins of 10.
# The argument has to be spelled `binwidth`; the misspelling `biwidth` is not
# recognised, so the plot silently fell back to the default range/30 bins.
qplot(x = friend_count, y = ..count../sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = "Friend Count",
      ylab = "Proportion of Users with that friend count",
      binwidth = 10, geom = "freqpoly", color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

Frequency Polygons (who has more likes)

# Frequency polygon of www_likes by gender on a log10 x axis (users with a
# missing gender are excluded).
# A plot has a single x scale: the scale_x_continuous() that used to precede
# scale_x_log10() was replaced by it straight away and only produced a
# "Scale for 'x' is already present" message, so it has been removed.
qplot(x = www_likes,
      data = subset(pf, !is.na(gender)),
      geom = "freqpoly", color = gender) +
  scale_x_log10()

## Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Statistics of likes ‘by’ Gender

# Total number of www_likes for each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)

## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

Box Plots

# Box plot of friend_count per gender (missing genders excluded).
# coord_cartesian() zooms the y axis to 0-250 without discarding the
# observations outside that window.
qplot(
  gender, friend_count,
  geom = 'boxplot',
  data = subset(pf, !is.na(gender))
) +
  coord_cartesian(ylim = c(0, 250))

# Per-gender summary statistics of friend_count, for reading the box plot.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Box Plots, Quartiles, Friend Requests

# Box plot of friendships_initiated per gender (missing genders excluded),
# with the y axis zoomed to 0-130 via coord_cartesian().
qplot(
  gender, friendships_initiated,
  geom = 'boxplot',
  data = subset(pf, !is.na(gender))
) +
  coord_cartesian(ylim = c(0, 130))

# Per-gender summary statistics of friendships_initiated.
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0

Getting logical

# Distribution of mobile_likes across all users.
summary(pf[["mobile_likes"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     4.0   106.1    46.0 25110.0

# How many users have at least one mobile like (TRUE) versus none (FALSE).
summary(pf[["mobile_likes"]] > 0)

##    Mode   FALSE    TRUE    NA's 
## logical   35056   63947       0

# Add a two-level factor column to pf: 1 when the user has at least one
# mobile like, 0 otherwise.
# The former stand-alone `mobile_check_in <- NA` only created an unused
# global variable (it never touched pf), so the column is built directly.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)

##     0     1 
## 35056 63947

Price Histogram

# Histogram of diamond prices using the default binning.
library(ggplot2)
qplot(
  price,
  data = diamonds,
  fill = I("#FF9999"),
  color = I('red'),
  xlab = 'Price of diamond',
  ylab = 'Number of diamonds in sample'
)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Summary statistics of diamond prices.
summary(diamonds[["price"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

Cheaper diamonds

# Histogram of diamond prices in 1-unit-wide bins with the x axis limited to
# 0-3000, then saved to disk.
# `limits` is spelled out in full instead of relying on partial matching of
# the abbreviated argument name `lim`.
library(ggplot2)
qplot(price, data = diamonds,
      xlab = 'Price of diamond',
      ylab = 'Number of diamonds in sample',
      color = I('red'), fill = I("#FF9999"),
      binwidth = 1) +
  scale_x_continuous(limits = c(0, 3000), breaks = seq(0, 3000, 250))

# With no plot argument, ggsave() writes the most recently created plot.
ggsave('priceHistogram.png')

## Saving 7 x 5 in image

#There are no diamonds that cost $1500.
#For diamonds that cost less than $2,000, the most common price of a diamond is around $700 with the mode being $605 (binwidth = 1).

Price by Cut Histogram

# Price histograms faceted by cut, laid out in two columns of panels.
# The plot object is still stored in `a`, but it is now printed as well:
# a ggplot that is only assigned to a variable is never drawn.
library(ggplot2)
a <- qplot(price, data = diamonds,
           xlab = 'Price of diamond',
           ylab = 'Number of diamonds in sample',
           color = I('red'), fill = I("#FF9999")) +
  facet_wrap(~cut, ncol = 2)
print(a)

# Highest price within each cut.
by(diamonds$price, diamonds$cut, max)

## diamonds$cut: Fair
## [1] 18574
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 18788
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 18818
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 18823
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 18806

# Lowest price within each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = min)

## diamonds$cut: Fair
## [1] 337
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 327
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 336
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 326
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 326

# Median price within each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = median)

## diamonds$cut: Fair
## [1] 3282
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 3050.5
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 2648
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 3185
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 1810

Scales and Multiple Histograms

# Price histograms faceted by cut; scales = "free_y" gives every panel its
# own y axis so cuts with few diamonds remain readable.
qplot(
  price,
  data = diamonds,
  fill = I("#FF9999"),
  color = I('red'),
  xlab = 'Price of diamond',
  ylab = 'Number of diamonds in sample'
) +
  facet_wrap(~cut, scales = "free_y")

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Summary statistics of price, computed separately for each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = summary)

## diamonds$cut: Fair
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3282    4359    5206   18570 
## -------------------------------------------------------- 
## diamonds$cut: Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327    1145    3050    3929    5028   18790 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     912    2648    3982    5373   18820 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1046    3185    4584    6296   18820 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     878    1810    3458    4678   18810

Price per Carat by Cut

# Histograms of price per carat in bins of 50, one panel per cut.
qplot(
  price/carat,
  data = diamonds,
  binwidth = 50,
  fill = I("#FF9999"),
  color = I('red'),
  xlab = 'Price per carat',
  ylab = 'Number of diamonds in sample'
) +
  facet_wrap(~cut)

# Histograms of price per carat by cut on a log10 x axis.
# Once the axis is log-transformed the bin width is measured in log10 units,
# so binwidth = 1 made every bin a full factor of ten wide and reduced each
# panel to a few very coarse bars. A width of 0.05 log10 units keeps the
# bins narrow enough to show the shape of the distribution.
qplot(x = price/carat, data = diamonds,
      xlab = 'Price per carat',
      ylab = 'Number of diamonds in sample',
      color = I('red'), fill = I("#FF9999"),
      binwidth = 0.05) +
  facet_wrap(~cut) +
  scale_x_log10()

Price Box Plots by clarity

# Box plot of price for each clarity grade (rows with a missing clarity are
# excluded).
qplot(
  clarity, price,
  geom = 'boxplot',
  data = subset(diamonds, !is.na(clarity))
)

# Summary statistics of price, computed separately for each clarity grade.
by(data = diamonds$price, INDICES = diamonds$clarity, FUN = summary)

## diamonds$clarity: I1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     345    2080    3344    3924    5161   18530 
## -------------------------------------------------------- 
## diamonds$clarity: SI2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    2264    4072    5063    5777   18800 
## -------------------------------------------------------- 
## diamonds$clarity: SI1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1089    2822    3996    5250   18820 
## -------------------------------------------------------- 
## diamonds$clarity: VS2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334     900    2054    3925    6024   18820 
## -------------------------------------------------------- 
## diamonds$clarity: VS1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327     876    2005    3839    6023   18800 
## -------------------------------------------------------- 
## diamonds$clarity: VVS2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   336.0   794.2  1311.0  3284.0  3638.0 18770.0 
## -------------------------------------------------------- 
## diamonds$clarity: VVS1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     816    1093    2523    2379   18780 
## -------------------------------------------------------- 
## diamonds$clarity: IF
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     369     895    1080    2865    2388   18810

# Write the most recently created plot to disk (ggsave() uses the last plot
# when none is passed explicitly).
ggsave(filename = 'priceHistogrambyclarity.png')

## Saving 7 x 5 in image

Price Box Plots by color

# Box plot of price for each color grade (rows with a missing color are
# excluded).
qplot(
  color, price,
  geom = 'boxplot',
  data = subset(diamonds, !is.na(color))
)

# Summary statistics of price, computed separately for each color grade.
by(data = diamonds$price, INDICES = diamonds$color, FUN = summary)

## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

# Interquartile range of price for color-D diamonds. which() skips any rows
# where the comparison is NA, as subset() does.
with(diamonds, IQR(price[which(color == "D")]))

## [1] 3302.5

# Interquartile range of price for color-J diamonds. which() skips any rows
# where the comparison is NA, as subset() does.
with(diamonds, IQR(price[which(color == "J")]))

## [1] 5834.5

# Write the most recently created plot to disk (ggsave() uses the last plot
# when none is passed explicitly).
ggsave(filename = 'priceHistogrambycolor.png')

## Saving 7 x 5 in image

Carat Frequency Polygon

# Frequency polygon of carat weight in bins of 0.01.
qplot(
  x = carat,
  data = diamonds,
  binwidth = 0.01,
  geom = "freqpoly"
)