Notes:
# Show the current working directory (the recorded value is machine-specific).
getwd()
## [1] "C:/Users/amackay/Documents/R Scripts"
# Switch to the data-source folder under the user's home directory.
# NOTE(review): this path is machine-specific, and nothing in this section
# reads from it (the data comes from data("diamonds")) -- confirm it is needed.
setwd("~/R Datasources")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.1
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.1
Notes: 1. Number of observations in the dataset: 53940 2. Number of variables in the dataset: 10 3. Number of ordered factors in the dataset: 3 4. Which letter represents the best color: D
# Load the diamonds dataset (available once ggplot2 is attached) and list
# its columns.
data("diamonds")
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# Histogram of price with the default binning. Built with ggplot() +
# geom_histogram() because qplot() is deprecated in current ggplot2 releases.
ggplot(diamonds, aes(x = price)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Quartiles, mean and range of price.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
# How many diamonds cost less than $500?
# The comparison is strict (`< 500`). The earlier `price < 501` filter also
# counted diamonds priced at exactly $500, so the 1749 recorded below is the
# count for "$500 or less"; re-run to refresh the figure.
summary(diamonds$price < 500)
## Mode FALSE TRUE NA's
## logical 52191 1749 0
# How many diamonds cost less than $250? == 0 (the minimum price is 326)
summary(diamonds$price < 250)
## Mode FALSE NA's
## logical 53940 0
# How many diamonds cost $15000 or more? == 1656
summary(diamonds$price >= 15000)
## Mode FALSE TRUE NA's
## logical 52284 1656 0
# Explore the largest peak in the histogram: restrict the x-axis to prices
# 670-700 and use one-dollar bins. Uses ggplot() + geom_histogram() because
# qplot() is deprecated in current ggplot2 releases.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#099DD9") +
  labs(x = "Price of Diamonds", y = "Count of Diamonds in sample") +
  scale_x_continuous(limits = c(670, 700), breaks = seq(600, 1000, 5))
# Break out the histogram of diamond prices by cut: same 670-700 window and
# one-dollar bins, one panel per cut with a shared y-axis. Uses ggplot() +
# geom_histogram() because qplot() is deprecated in current ggplot2 releases.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#099DD9") +
  labs(x = "Price of Diamonds", y = "Count of Diamonds in sample") +
  scale_x_continuous(limits = c(670, 700), breaks = seq(600, 1000, 5)) +
  facet_wrap(~cut)
# Which cut has the highest priced diamond?
# max() is applied directly instead of summary(), because summary() rounds
# the values it prints.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = max)
## diamonds$cut: Fair
## [1] 18574
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 18788
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 18818
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 18823
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 18806
# Which cut has the lowest priced diamond?
by(data = diamonds$price, INDICES = diamonds$cut, FUN = min)
## diamonds$cut: Fair
## [1] 337
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 327
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 336
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 326
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 326
# Which cut has the lowest median price?
by(data = diamonds$price, INDICES = diamonds$cut, FUN = median)
## diamonds$cut: Fair
## [1] 3282
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 3050.5
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 2648
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 3185
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 1810
NOTES:
# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table,
# the median and quartiles are reasonably close to each other.
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# Numerical summary (quartiles, mean, range) of price within each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = summary)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 2050 3282 4359 5206 18570
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 1145 3050 3929 5028 18790
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 912 2648 3982 5373 18820
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1046 3185 4584 6296 18820
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 878 1810 3458 4678 18810
# The 'Fair' and 'Good' diamonds appear to have
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right
# Look up the documentation for facet_wrap in RStudio (?facet_wrap).
# Then add a parameter to facet_wrap so that
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.
# Look at free scales here: http://www.cookbook-r.com/Graphs/Facets_(ggplot2)/
# Price histogram per cut over the 670-700 window with one-dollar bins.
# scales = "free_y" lets every panel pick its own y-axis range. Uses
# ggplot() + geom_histogram() because qplot() is deprecated in current
# ggplot2 releases.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#099DD9") +
  labs(x = "Price of Diamonds", y = "Count of Diamonds in sample") +
  scale_x_continuous(limits = c(670, 700), breaks = seq(600, 1000, 5)) +
  facet_wrap(~cut, scales = "free_y")
***
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# Price per carat is price divided by carat. The column stays numeric so that
# summary() reports quantiles rather than counts of individual values.
diamonds$price_per_carat <- diamonds$price / diamonds$carat
# NOTE: the summary printed below was recorded when price_per_carat was
# computed as carat / price and stored as a factor; re-run to refresh it.
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z price_per_carat
## Min. : 0.000 Min. : 0.000 0.000444444444444444: 331
## 1st Qu.: 4.720 1st Qu.: 2.910 0.000555555555555556: 215
## Median : 5.710 Median : 3.530 0.000238095238095238: 160
## Mean : 5.735 Mean : 3.539 0.000495867768595041: 157
## 3rd Qu.: 6.540 3rd Qu.: 4.040 0.000476190476190476: 129
## Max. :58.900 Max. :31.800 0.00037037037037037 : 126
## (Other) :52822
# Histogram of price per carat, one panel per cut, each panel with its own
# y-axis range. Uses ggplot() + geom_histogram() because qplot() is
# deprecated in current ggplot2 releases.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Same faceted histogram of price per carat with the x-axis transformed to a
# log10 scale. Uses ggplot() + geom_histogram() because qplot() is deprecated
# in current ggplot2 releases.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y") +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Log-scale faceted histogram of price per carat with colors and axis titles.
# Uses ggplot() + geom_histogram() because qplot() is deprecated in current
# ggplot2 releases.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(colour = "black", fill = "#099DD9") +
  labs(x = "Price per Carat", y = "Number of diamonds in sample") +
  facet_wrap(~cut, scales = "free_y") +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
***
NOTES:
# Numerical summary (quartiles, mean, range) of price within each color.
by(data = diamonds$price, INDICES = diamonds$color, FUN = summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
# IQR() returns the interquartile range of a numeric vector. It can be given
# one column of a subset of the diamonds data frame, for example:
#   IQR(subset(diamonds, price < 1000)$price)
# subset() returns a data frame, hence the trailing $price to pull the column.
# Here IQR() is applied to price within each color.
by(data = diamonds$price, INDICES = diamonds$color, FUN = IQR)
## diamonds$color: D
## [1] 3302.5
## --------------------------------------------------------
## diamonds$color: E
## [1] 3121
## --------------------------------------------------------
## diamonds$color: F
## [1] 3886.25
## --------------------------------------------------------
## diamonds$color: G
## [1] 5117
## --------------------------------------------------------
## diamonds$color: H
## [1] 4996.25
## --------------------------------------------------------
## diamonds$color: I
## [1] 6081.25
## --------------------------------------------------------
## diamonds$color: J
## [1] 5834.5
# Box plots of price per carat for each color. Uses ggplot() + geom_boxplot()
# because qplot() is deprecated in current ggplot2 releases.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot()
# Limit the y-axis to 1500-6500. coord_cartesian() only changes the visible
# range, so the box statistics are still computed from every row.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(1500, 6500))
# Check and validate the box plots against a numerical summary of
# price per carat within each color.
by(data = diamonds$price / diamonds$carat, INDICES = diamonds$color, FUN = summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1128 2455 3411 3953 4749 17830
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1078 2430 3254 3805 4508 14610
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1168 2587 3494 4135 4947 13860
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1139 2538 3490 4163 5500 12460
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1051 2397 3819 4008 5127 10190
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1152 2345 3780 3996 5197 9398
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1081 2563 3780 3826 4928 8647
NOTES: Determine whether each of these carat sizes has a count greater than 2000: 0.1, 0.3, 0.8, 1.01, 1.6, 2.0, 3.0, 5.0
# Frequency polygon of carat. binwidth = 0.01 gives each two-decimal carat
# size (e.g. 1.01) its own bin, so its count can be read against the 2000
# mark; the default 30-bin width merges neighbouring carat sizes together.
# Uses ggplot() + geom_freqpoly() because qplot() is deprecated in current
# ggplot2 releases.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  labs(x = "Carat", y = "Count of diamonds in the sample") +
  scale_x_continuous(breaks = seq(0, 6, 0.1)) +
  scale_y_continuous(breaks = seq(0, 9000, 1000))
***