Problem Set Lesson 3 - Explore One Variable

Load the Diamonds data set

Notes:

library(ggplot2)
# Bring the diamonds data set (shipped with ggplot2) into the workspace.
data("diamonds")
# Run ?diamonds for the data dictionary.
# Compact overview: dimensions plus the type and first values of each column.
str(diamonds)

## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

How Many observations are in the dataset? How many variables? How many ordered factors?

Notes:

# ?diamonds
# The structure listing answers the three questions: the header line reports
# 53940 observations of 10 variables, and the rows marked "Ord.factor"
# (cut, color, clarity) are the 3 ordered factors.
str(diamonds)

## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Create a histogram of the price of all the diamonds in the data set

Notes:

# Histogram of price for every diamond, using $300-wide bins.
# Written with ggplot() + geom_histogram() because qplot() is deprecated.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 300)

# Numerical summary of every column (quartiles for numeric columns,
# level counts for the factors).
summary(diamonds)

##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
##

# price
# Min.   :  326
# 1st Qu.:  950
# Median : 2401
# Mean   : 3933
# 3rd Qu.: 5324
# Max.   : 18823

Diamond Counts

Notes:

# How many diamonds cost less than $500?
# How many diamonds cost less than $250?
# How many diamonds cost $15,000 or more?

# Diamonds priced under $500
inexpensive <- subset(diamonds, price < 500)
nrow(inexpensive)

## [1] 1729

# Diamonds priced under $250 (none: the summary above shows a minimum of $326)
cheap <- subset(diamonds, price < 250)
nrow(cheap)

## [1] 0

# Diamonds priced at $15,000 or more. The comparison is inclusive (>=),
# so a diamond costing exactly $15,000 is counted.
expensive <- subset(diamonds, price >= 15000)
nrow(expensive)

## [1] 1656

Explore Cheaper Diamonds

Notes:

# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
#                  qplot(x = price, data = diamonds)
#                  ggsave('priceHistogram.png')
# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
# Submit your final code when you are ready.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================================

# Inspiration:
# qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
#  scale_x_continuous(limits = c(0, 1000),
#  breaks = seq(0, 1000, 50)) + facet_wrap(~gender)

# Zoom in on the tallest peak of the price histogram (around $700):
# $15 bins, x-axis limited to $0-$1,500 with a tick every $100.
# The scale limits discard prices outside the range before binning, which is
# the source of the "Removed ... rows" warning.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 15) +
  scale_x_continuous(limits = c(0, 1500), breaks = seq(0, 1500, 100)) +
  labs(x = 'Price', y = 'Frequency ')

## Warning: Removed 33930 rows containing non-finite values (stat_bin).

#  facet_wrap(~cut) + facet_wrap(~color) +  facet_wrap(~clarity)

# Experiment: the same zoom with $1 bins and the x-axis starting at $300,
# to see individual price points inside the peak.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(limits = c(300, 1500), breaks = seq(300, 1500, 100)) +
  labs(x = 'Price', y = 'Frequency ')

## Warning: Removed 33930 rows containing non-finite values (stat_bin).

# Keep only the diamonds priced below $1,500 and count them.
midRange <- diamonds[diamonds$price < 1500, ]
nrow(midRange)

## [1] 20010

# Column-by-column summary of that lower-priced subset.
summary(midRange)

##      carat               cut       color       clarity         depth      
##  Min.   :0.2000   Fair     : 266   D:2848   VS2    :4514   Min.   :51.00  
##  1st Qu.:0.3100   Good     :1548   E:4360   SI1    :4279   1st Qu.:61.10  
##  Median :0.3400   Very Good:4186   F:3617   VS1    :3196   Median :61.80  
##  Mean   :0.3677   Premium  :4492   G:4351   VVS2   :2595   Mean   :61.73  
##  3rd Qu.:0.4100   Ideal    :9518   H:2683   VVS1   :2360   3rd Qu.:62.40  
##  Max.   :1.0300                    I:1543   SI2    :1741   Max.   :78.20  
##                                    J: 608   (Other):1325                  
##      table           price              x               y       
##  Min.   :44.00   Min.   : 326.0   Min.   :3.730   Min.   :3.68  
##  1st Qu.:56.00   1st Qu.: 630.0   1st Qu.:4.350   1st Qu.:4.35  
##  Median :57.00   Median : 805.0   Median :4.500   Median :4.51  
##  Mean   :57.01   Mean   : 841.4   Mean   :4.581   Mean   :4.59  
##  3rd Qu.:58.00   3rd Qu.:1024.0   3rd Qu.:4.770   3rd Qu.:4.78  
##  Max.   :70.00   Max.   :1454.0   Max.   :6.650   Max.   :6.09  
##                                                                 
##        z       
##  Min.   :2.06  
##  1st Qu.:2.69  
##  Median :2.77  
##  Mean   :2.83  
##  3rd Qu.:2.95  
##  Max.   :4.44  
##

Price by Cut - Histograms

Notes:

# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================

# One price histogram per cut ($10 bins), laid out as five panels.
# ggplot() replaces the deprecated qplot(); the NA filter was dropped because
# price has no missing values in this data set.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 10) +
  labs(x = 'Price', y = 'Frequency') +
  facet_wrap(~cut)

Price by Cut

Notes:

# Whole-data-set summary, kept as the baseline for the per-cut summaries below.
summary(diamonds)

##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
##

# Rows whose cut is 'Fair', followed by their column summaries.
fair <- diamonds[diamonds$cut == 'Fair', ]
# fair
summary(fair)

##      carat              cut       color      clarity        depth      
##  Min.   :0.220   Fair     :1610   D:163   SI2    :466   Min.   :43.00  
##  1st Qu.:0.700   Good     :   0   E:224   SI1    :408   1st Qu.:64.40  
##  Median :1.000   Very Good:   0   F:312   VS2    :261   Median :65.00  
##  Mean   :1.046   Premium  :   0   G:314   I1     :210   Mean   :64.04  
##  3rd Qu.:1.200   Ideal    :   0   H:303   VS1    :170   3rd Qu.:65.90  
##  Max.   :5.010                    I:175   VVS2   : 69   Max.   :79.00  
##                                   J:119   (Other): 26                  
##      table           price             x                y         
##  Min.   :49.00   Min.   :  337   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.: 2050   1st Qu.: 5.630   1st Qu.: 5.570  
##  Median :58.00   Median : 3282   Median : 6.175   Median : 6.100  
##  Mean   :59.05   Mean   : 4359   Mean   : 6.247   Mean   : 6.183  
##  3rd Qu.:61.00   3rd Qu.: 5206   3rd Qu.: 6.700   3rd Qu.: 6.640  
##  Max.   :95.00   Max.   :18574   Max.   :10.740   Max.   :10.540  
##                                                                   
##        z        
##  Min.   :0.000  
##  1st Qu.:3.610  
##  Median :3.970  
##  Mean   :3.983  
##  3rd Qu.:4.280  
##  Max.   :6.980  
##

# Rows whose cut is 'Good', followed by their column summaries.
good <- diamonds[diamonds$cut == 'Good', ]
# good
summary(good)

##      carat               cut       color      clarity         depth      
##  Min.   :0.2300   Fair     :   0   D:662   SI1    :1560   Min.   :54.30  
##  1st Qu.:0.5000   Good     :4906   E:933   SI2    :1081   1st Qu.:61.30  
##  Median :0.8200   Very Good:   0   F:909   VS2    : 978   Median :63.40  
##  Mean   :0.8492   Premium  :   0   G:871   VS1    : 648   Mean   :62.37  
##  3rd Qu.:1.0100   Ideal    :   0   H:702   VVS2   : 286   3rd Qu.:63.80  
##  Max.   :3.0100                    I:522   VVS1   : 186   Max.   :67.00  
##                                    J:307   (Other): 167                  
##      table           price             x               y        
##  Min.   :51.00   Min.   :  327   Min.   :0.000   Min.   :0.000  
##  1st Qu.:56.00   1st Qu.: 1145   1st Qu.:5.020   1st Qu.:5.020  
##  Median :58.00   Median : 3050   Median :5.980   Median :5.990  
##  Mean   :58.69   Mean   : 3929   Mean   :5.839   Mean   :5.851  
##  3rd Qu.:61.00   3rd Qu.: 5028   3rd Qu.:6.420   3rd Qu.:6.440  
##  Max.   :66.00   Max.   :18788   Max.   :9.440   Max.   :9.380  
##                                                                 
##        z       
##  Min.   :0.00  
##  1st Qu.:3.07  
##  Median :3.70  
##  Mean   :3.64  
##  3rd Qu.:4.03  
##  Max.   :5.79  
##

# Rows whose cut is 'Very Good', followed by their column summaries.
verygood <- diamonds[diamonds$cut == 'Very Good', ]
# verygood
summary(verygood)

##      carat               cut        color       clarity    
##  Min.   :0.2000   Fair     :    0   D:1513   SI1    :3240  
##  1st Qu.:0.4100   Good     :    0   E:2400   VS2    :2591  
##  Median :0.7100   Very Good:12082   F:2164   SI2    :2100  
##  Mean   :0.8064   Premium  :    0   G:2299   VS1    :1775  
##  3rd Qu.:1.0200   Ideal    :    0   H:1824   VVS2   :1235  
##  Max.   :4.0000                     I:1204   VVS1   : 789  
##                                     J: 678   (Other): 352  
##      depth           table           price             x         
##  Min.   :56.80   Min.   :44.00   Min.   :  336   Min.   : 0.000  
##  1st Qu.:60.90   1st Qu.:56.00   1st Qu.:  912   1st Qu.: 4.750  
##  Median :62.10   Median :58.00   Median : 2648   Median : 5.740  
##  Mean   :61.82   Mean   :57.96   Mean   : 3982   Mean   : 5.741  
##  3rd Qu.:62.90   3rd Qu.:59.00   3rd Qu.: 5373   3rd Qu.: 6.470  
##  Max.   :64.90   Max.   :66.00   Max.   :18818   Max.   :10.010  
##                                                                  
##        y              z        
##  Min.   :0.00   Min.   : 0.00  
##  1st Qu.:4.77   1st Qu.: 2.95  
##  Median :5.77   Median : 3.56  
##  Mean   :5.77   Mean   : 3.56  
##  3rd Qu.:6.51   3rd Qu.: 4.02  
##  Max.   :9.94   Max.   :31.80  
##

# Rows whose cut is 'Premium', followed by their column summaries.
premium <- diamonds[diamonds$cut == 'Premium', ]
# premium
summary(premium)

##      carat              cut        color       clarity         depth      
##  Min.   :0.200   Fair     :    0   D:1603   SI1    :3575   Min.   :58.00  
##  1st Qu.:0.410   Good     :    0   E:2337   VS2    :3357   1st Qu.:60.50  
##  Median :0.860   Very Good:    0   F:2331   SI2    :2949   Median :61.40  
##  Mean   :0.892   Premium  :13791   G:2924   VS1    :1989   Mean   :61.26  
##  3rd Qu.:1.200   Ideal    :    0   H:2360   VVS2   : 870   3rd Qu.:62.20  
##  Max.   :4.010                     I:1428   VVS1   : 616   Max.   :63.00  
##                                    J: 808   (Other): 435                  
##      table           price             x                y         
##  Min.   :51.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:58.00   1st Qu.: 1046   1st Qu.: 4.800   1st Qu.: 4.790  
##  Median :59.00   Median : 3185   Median : 6.110   Median : 6.060  
##  Mean   :58.75   Mean   : 4584   Mean   : 5.974   Mean   : 5.945  
##  3rd Qu.:60.00   3rd Qu.: 6296   3rd Qu.: 6.800   3rd Qu.: 6.760  
##  Max.   :62.00   Max.   :18823   Max.   :10.140   Max.   :58.900  
##                                                                   
##        z        
##  Min.   :0.000  
##  1st Qu.:2.940  
##  Median :3.720  
##  Mean   :3.647  
##  3rd Qu.:4.160  
##  Max.   :8.060  
##

# Rows whose cut is 'Ideal', followed by their column summaries.
ideal <- diamonds[diamonds$cut == 'Ideal', ]
# ideal
summary(ideal)

##      carat               cut        color       clarity    
##  Min.   :0.2000   Fair     :    0   D:2834   VS2    :5071  
##  1st Qu.:0.3500   Good     :    0   E:3903   SI1    :4282  
##  Median :0.5400   Very Good:    0   F:3826   VS1    :3589  
##  Mean   :0.7028   Premium  :    0   G:4884   VVS2   :2606  
##  3rd Qu.:1.0100   Ideal    :21551   H:3115   SI2    :2598  
##  Max.   :3.5000                     I:2093   VVS1   :2047  
##                                     J: 896   (Other):1358  
##      depth           table           price             x        
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   :0.000  
##  1st Qu.:61.30   1st Qu.:55.00   1st Qu.:  878   1st Qu.:4.540  
##  Median :61.80   Median :56.00   Median : 1810   Median :5.250  
##  Mean   :61.71   Mean   :55.95   Mean   : 3458   Mean   :5.507  
##  3rd Qu.:62.20   3rd Qu.:57.00   3rd Qu.: 4678   3rd Qu.:6.440  
##  Max.   :66.70   Max.   :63.00   Max.   :18806   Max.   :9.650  
##                                                                 
##        y                z        
##  Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 4.550   1st Qu.:2.800  
##  Median : 5.260   Median :3.230  
##  Mean   : 5.520   Mean   :3.401  
##  3rd Qu.: 6.445   3rd Qu.:3.980  
##  Max.   :31.800   Max.   :6.030  
##

#                   FAIR     GOOD        VG     PREMIUM     IDEAL
# Highest:          18574    18788     18818      18823     18806
# Lowest Median:    3282      3050      2648       3185      1810
# Lowest:           337       327        336        326       326

Scales and Multiple Histograms

Notes:

# In the two last exercises, we looked at
# the distribution for diamonds by cut.
# Run the code below in R Studio to generate
# the histogram as a reminder.
# ===============================================================
# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table,
# the median and quartiles are reasonably close to each other.

# diamonds$cut: Fair
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     337    2050    3282    4359    5206   18570
# ------------------------------------------------------------------------
# diamonds$cut: Good
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     327    1145    3050    3929    5028   18790
# ------------------------------------------------------------------------
# diamonds$cut: Very Good
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     336     912    2648    3982    5373   18820
# ------------------------------------------------------------------------
# diamonds$cut: Premium
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     326    1046    3185    4584    6296   18820
# ------------------------------------------------------------------------
# diamonds$cut: Ideal
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#     326     878    1810    3458    4678   18810

# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# ===============================================================
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.

# The 'Fair' and 'Good' diamonds appear to have
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right.
# Let's look in to this more.
# Look up the documentation for facet_wrap in R Studio.
# Then, scroll back up and add a parameter to facet_wrap so that
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.


# Price histograms faceted by cut; every panel shares the same y-axis, so the
# cuts with few diamonds look almost flat.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Solution: Free Scales: http://www.cookbook-r.com/Graphs/Facets_(ggplot2)/
# scales = "free_y" lets each panel pick its own y-axis range, so the shape of
# each cut's distribution becomes visible.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Price per Carat by Cut

Notes:

# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================


# Inspiration: price histograms by cut with a free y-axis per panel.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Solution: histogram of price per carat on a log10 x-axis, faceted by cut.
# binwidth has to be given to geom_histogram(); passed to ggplot() it is
# ignored and the default of 30 bins is used instead. Because the x scale is
# log10-transformed, the width is expressed in log10 units (0.05 of a decade).
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  facet_wrap(~cut, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Experiments 
# qplot(x=price, data = subset(diamonds, !is.na(price)), binwidth = 10, xlab = 'Price', ylab = 'Frequency') +  
#  facet_wrap(~cut)  

# qplot(x = price/carat, data = diamonds, binwidth = 200, xlab = 'Price/Carat', ylab = 'Frequency')

Price Box Plots

Notes:

# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# =================================================================

# INSPIRATION:
# A better method - using coord_cartesian Layer - adjusting for zoom into 250 count.
# qplot(x= gender, y = friend_count, 
#      data = subset(pf, !is.na(gender)), 
#      geom = 'boxplot') + 
#  coord_cartesian(ylim = c(0, 250))
#
# Get actual Numbers
# by(pf$friend_count, pf$gender, summary)

# Box plot of price for each color grade. coord_cartesian() zooms the y-axis
# to $0-$8,000 without discarding the points above it, so the boxes are still
# computed from all diamonds.
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000))

# Get actual numbers: five-number summary and mean of price per color grade.
by(diamonds$price, diamonds$color, summary)

## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

# diamonds$color: D
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    357     911    1838    3170    4214   18690 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: E
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    326     882    1739    3077    4003   18730 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: F
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    342     982    2344    3725    4868   18790 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: G
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    354     931    2242    3999    6048   18820 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: H
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    337     984    3460    4487    5980   18800 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: I
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    334    1120    3730    5092    7202   18820 
# ---------------------------------------------------------------------------------------- 
# diamonds$color: J
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    335    1860    4234    5324    7695   18710

Interquartile Range (IQR) = Upper Quartile - Lower Quartile

# ?diamonds

# Diamonds with color grade D, followed by their column summaries; the price
# quartiles feed the IQR calculation.
best_color <- diamonds[diamonds$color == 'D', ]
# best_color['price']
summary(best_color)

##      carat               cut       color       clarity         depth     
##  Min.   :0.2000   Fair     : 163   D:6775   SI1    :2083   Min.   :52.2  
##  1st Qu.:0.3600   Good     : 662   E:   0   VS2    :1697   1st Qu.:61.0  
##  Median :0.5300   Very Good:1513   F:   0   SI2    :1370   Median :61.8  
##  Mean   :0.6578   Premium  :1603   G:   0   VS1    : 705   Mean   :61.7  
##  3rd Qu.:0.9050   Ideal    :2834   H:   0   VVS2   : 553   3rd Qu.:62.5  
##  Max.   :3.4000                    I:   0   VVS1   : 252   Max.   :71.6  
##                                    J:   0   (Other): 115                 
##      table          price             x               y        
##  Min.   :52.0   Min.   :  357   Min.   :0.000   Min.   :0.000  
##  1st Qu.:56.0   1st Qu.:  911   1st Qu.:4.590   1st Qu.:4.600  
##  Median :57.0   Median : 1838   Median :5.230   Median :5.240  
##  Mean   :57.4   Mean   : 3170   Mean   :5.417   Mean   :5.421  
##  3rd Qu.:59.0   3rd Qu.: 4214   3rd Qu.:6.180   3rd Qu.:6.180  
##  Max.   :73.0   Max.   :18693   Max.   :9.420   Max.   :9.340  
##                                                                
##        z        
##  Min.   :0.000  
##  1st Qu.:2.820  
##  Median :3.220  
##  Mean   :3.343  
##  3rd Qu.:3.840  
##  Max.   :6.270  
##

# 4214 - 911 = 3303

# Diamonds with color grade J, followed by their column summaries; the price
# quartiles feed the IQR calculation.
worst_color <- diamonds[diamonds$color == 'J', ]
# worst_color['price']
summary(worst_color)

##      carat              cut      color       clarity        depth      
##  Min.   :0.230   Fair     :119   D:   0   SI1    :750   Min.   :43.00  
##  1st Qu.:0.710   Good     :307   E:   0   VS2    :731   1st Qu.:61.20  
##  Median :1.110   Very Good:678   F:   0   VS1    :542   Median :62.00  
##  Mean   :1.162   Premium  :808   G:   0   SI2    :479   Mean   :61.89  
##  3rd Qu.:1.520   Ideal    :896   H:   0   VVS2   :131   3rd Qu.:62.70  
##  Max.   :5.010                   I:   0   VVS1   : 74   Max.   :73.60  
##                                  J:2808   (Other):101                  
##      table           price             x                y         
##  Min.   :51.60   Min.   :  335   Min.   : 3.930   Min.   : 3.900  
##  1st Qu.:56.00   1st Qu.: 1860   1st Qu.: 5.700   1st Qu.: 5.718  
##  Median :58.00   Median : 4234   Median : 6.640   Median : 6.630  
##  Mean   :57.81   Mean   : 5324   Mean   : 6.519   Mean   : 6.518  
##  3rd Qu.:59.00   3rd Qu.: 7695   3rd Qu.: 7.380   3rd Qu.: 7.380  
##  Max.   :68.00   Max.   :18710   Max.   :10.740   Max.   :10.540  
##                                                                   
##        z        
##  Min.   :2.460  
##  1st Qu.:3.530  
##  Median :4.110  
##  Mean   :4.033  
##  3rd Qu.:4.580  
##  Max.   :6.980  
##

# 7695 - 1860 = 5835

Price Per Carat Box Plots by Color

Notes:

# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.
# SUBMIT YOUR CODE BELOW THIS LINE
# ===================================================================


# Box plots of price per carat for each color grade.
# The y aesthetic must be price / carat; plotting raw price would contradict
# the 'Price Per Carat' axis label and the purpose of this exercise.
# coord_cartesian() zooms the y-axis without dropping any observations.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000)) +
  xlab('Color') +
  ylab('Price Per Carat')

# Get actual numbers: summary of price per carat within each color grade.
# NOTE(review): any output recorded from an earlier run of this section was
# computed on raw price; re-run to refresh it.
by(diamonds$price / diamonds$carat, diamonds$color, summary)

## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

Carat Frequency Polygon

Notes:

# Investigate the weight of the diamonds (carat) using a frequency polygon. 
# Which carat sizes have a count greater than 2000?

# INSPIRATION:
# qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10, geom = 'freqpoly', color = gender) + 
#   scale_x_continuous(lim = c(0,1000), breaks = seq(0, 1000, 50))


# Frequency polygon of carat weight over all diamonds.
# - No color split: the question is about the total count per carat size, and
#   splitting by color would show only per-color counts.
# - binwidth = 0.01 matches the two-decimal resolution of the carat values in
#   the str() listing, so each point is the count of one carat size.
# - The dashed line marks the 2000-count threshold being asked about.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  geom_hline(yintercept = 2000, linetype = 'dashed') +
  scale_x_continuous(breaks = seq(0, 5, 0.5))

# Count the diamonds weighing 0.3 carat. carat is a double, so compare with a
# small tolerance instead of exact ==, which is fragile for floating point.
weight_ct <- subset(diamonds, abs(carat - 0.3) < 1e-9)
nrow(weight_ct)

## [1] 2604

# weight_ct <- subset(diamonds, carat == 0.3)
# > nrow(weight_ct)
# [1] 2604

# Same count for 1.01 carat, again using a tolerance comparison.
weight_ct <- subset(diamonds, abs(carat - 1.01) < 1e-9)
nrow(weight_ct)

## [1] 2242

# > weight_ct <- subset(diamonds, carat == 1.01)
# > nrow(weight_ct)
# [1] 2242

Gap Minder Data

Notes:

# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# You might use a simple histogram, a boxplot split over a categorical variable,
# or a frequency polygon. The choice is yours!
# You can find a link to the Gapminder website in the Instructor Notes.
# http://www.gapminder.org/data/
# Once you've completed your investigation, create a post in the discussions that includes:
#       1. any questions you answered, your observations, and summary statistics
#       2. snippets of code that created the plots
#       3. links to the images of your plots

# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.

# Read the CSV file; the first column supplies the row names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
cpi_df <- read.csv("corruption_perception.csv", header = TRUE, row.names = 1,
                   check.names = TRUE)

# Drop the columns named X.1, X.2 and X.3, leaving only the two year columns.
# NOTE(review): these look like auto-named empty columns from the CSV; confirm.
cpi_df <- subset(cpi_df, select = -c(X.1, X.2, X.3))

str(cpi_df)

## 'data.frame':    180 obs. of  2 variables:
##  $ X2008: num  9.4 9.3 9.2 9.2 9 8.9 8.9 8.7 8.7 8.7 ...
##  $ X2009: num  9.3 9.3 9.2 9.3 8.7 9.2 8.8 8.5 8.9 8.7 ...

# Get Quick Summary
summary(cpi_df)

##      X2008           X2009      
##  Min.   :1.100   Min.   :1.100  
##  1st Qu.:2.500   1st Qu.:2.400  
##  Median :3.300   Median :3.300  
##  Mean   :4.031   Mean   :4.007  
##  3rd Qu.:5.125   3rd Qu.:5.025  
##  Max.   :9.400   Max.   :9.300  
##                  NA's   :4

# Load the Plot Library
library(ggplot2)

# Histogram of the 2008 index scores in 0.1-wide bins.
# The x-axis carries the index value and the y-axis the number of rows in each
# bin, so the labels say exactly that.
ggplot(cpi_df, aes(x = X2008)) +
  geom_histogram(binwidth = 0.1, colour = 'blue', fill = '#F79420') +
  # scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab('Corruption Perception Index (2008)') +
  ylab('Count')

# 2008 score against 2009 score: one point per row, joined by a single line
# (group = 1 puts every row in the same line group).
ggplot(cpi_df, aes(x = X2008, y = X2009, group = 1)) +
  geom_line() +
  geom_point()

## Warning: Removed 4 rows containing missing values (geom_point).

# Compare the distribution of index scores between the two years.
# A box plot needs a categorical x; mapping the continuous X2008 column to x
# triggers the "Continuous x aesthetic" warning and yields a single box.
# stack() reshapes the two year columns into long form: `values` holds the
# scores and `ind` is a factor naming the column each score came from.
cpi_long <- stack(cpi_df[c("X2008", "X2009")])
ggplot(cpi_long, aes(x = ind, y = values)) +
  geom_boxplot() +
  xlab('Year') +
  ylab('Corruption Perception Index')

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Exploring Your Friends Birthdays

Notes:

# Your task is to investigate the distribution of your friends'
# birth months and days.
# Here some questions you could answer, and we hope you think of others.
# **********************************************************************
# How many people share your birthday? Do you know them?
# (Reserve time with them or save money to buy them a gift!)
# Which month contains the most number of birthdays?
# How many birthdays are in each month?
# Which day of the year has the most number of birthdays?
# Do you have at least 365 friends that have birthdays on everyday
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.

# You can expect to spend 30 min or more on this task depending if you
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.

# You'll need to export your Facebooks friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends' birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.

# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.

# We've included some links in the Instructor Notes to help get you started.

# Once you've completed your investigation, create a post in the discussions that includes:
#       1. any questions you answered, your observations, and summary statistics
#       2. snippets of code that created the plots
#       3. links to the images of your plots

# Copy and paste all of the code that you used for
# your investigation below the line. Submit it when you are ready.
# ===============================================================================

# Read the example birthdays file; the first column supplies the row names and
# column names are left exactly as they appear in the file.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
birthdays_df <- read.csv("birthdaysExample.csv", header = TRUE, row.names = 1,
                         check.names = FALSE)

str(birthdays_df)

## 'data.frame':    1033 obs. of  1 variable:
##  $ dates: Factor w/ 348 levels "1/1/2014","1/10/2014",..: 78 258 322 219 131 241 33 46 287 331 ...

# ?strptime


# Keep the dates column as a one-column data frame (single-bracket indexing
# preserves the data frame); uncomment the next line to print it.
dates <- birthdays_df['dates']
# dates

# Create a new DF using the separate() command
# new_date_df = separate(birthdays_df, dates, c("month", "day", "year"), sep="/")
# new_date_df

# Months
# qplot(x=month, data = new_date_df)

# Days
# qplot(x=day, data = new_date_df)

# Frequency for Month: March is most populated.
# count(new_date_df, 'month')

# Frequency: The 14th is the most populated.
# count(new_date_df, 'day')



# > count(new_date_df, 'month')
#   month freq
# 1      1   89
# 2     10   89
# 3     11   87
# 4     12   72
# 5      2   79
# 6      3   98
# 7      4   81
# 8      5   72
# 9      6   93
# 10     7   86
# 11     8   91
# 12     9   96

TEMPLATE

Notes:

Problem Set Lesson 3 - Explore One Variable

Load the Diamonds data set

How Many observations are in the dataset? How many variables? How many ordered factors?

Create a histogram of the price of all the diamonds in the data set

Diamond Counts

Explore Cheaper Diamonds

Price by Cut - Histograms

Price by Cut

Scales and Multiple Histograms

Price per Carat by Cut

Price Box Plots

Interquartile Range (IQR) = Upper Quartile - Lower Quartile

Price Per Carat Box Plots by Color

Carat Frequency Polygon

Gap Minder Data

Exploring Your Friends Birthdays

TEMPLATE

The End !