Problem Set 3

Load the diamonds data set. Determine the characteristics of the data set and the types of variables in it. ??view #no help view(diamonds) #can't get view() to work and found no documentation; R is case-sensitive, so the data viewer is View(), not view(). levels(diamonds$color)

Descriptive statistics on price Create a histogram for the price of diamonds

hist(diamonds$price)

mean and median

summary(diamonds$price)

How many diamonds cost less than $500? 250? More than 15000?

# Count the diamonds in each price band. The comparisons are strict so that
# they match the question as worded ("less than", "more than").
sum(diamonds$price < 500)

sum(diamonds$price < 250)

sum(diamonds$price > 15000)

Now explore the histogram and attempt some transformations of the axis, bin widths and breaks

library(ggplot2)

qplot(price, data = diamonds, color = "red", xlim == 15000) gets the error: "Error in xlim == 15000 : comparison (1) is possible only for atomic and list types". Try different numbers:

# Record of attempts to limit the x axis of a price histogram.
# NOTE: the "could not find function" messages recorded below indicate that
# ggplot2 was not attached in the session that produced this output.
# `xlim == <number>` is a comparison expression, not a named argument, so
# changing the number cannot make these three calls work.
qplot(price, data = diamonds, color = "red", xlim == 15001)
## Error: could not find function "qplot"
qplot(price, data = diamonds, color = "red", xlim == 16000)
## Error: could not find function "qplot"
qplot(price, data = diamonds, color = "red", xlim == 10000)
## Error: could not find function "qplot"
# Well, that was futile.
# A single `=` makes xlim an argument, but a range needs both ends.
qplot(price, data = diamonds, color = "red", xlim = 15000)
## Error: could not find function "qplot"
# Single equal sign no help.  Ahh, it needs two numbers!
qplot(price, data = diamonds, color = "green", xlim = c(0, 15000))
## Error: could not find function "qplot"
# weird, color doesn't change.
# (A bare colour string is mapped like a variable rather than used as a
# literal colour; wrapping it in I(), e.g. color = I("green"), sets it.)
# NOTE(review): facets = . ~ price requests one panel per distinct price,
# which is presumably why the next call never finished.
qplot(price, data = diamonds, facets = . ~ price, xlim = c(0, 15000))
## Error: could not find function "qplot"

Trying to get facets, seems like it is taking forever. Stopped after a few minutes of getting binwidth defaulted messages.

# Price histogram capped at 15000. binwidth is a histogram setting, not a
# column of diamonds, so it cannot be used as a faceting variable.
qplot(price, data = diamonds, binwidth = 10, xlim = c(0, 15000))

# 2 figures arranged in 1 row and 2 columns; remember the previous layout so
# it can be restored afterwards.
old_par <- par(mfcol = c(1, 2))
# hist() has no binwidth argument: the number of cells is requested through
# `breaks`, which hist() treats as a suggestion and may round to "pretty"
# values. Columns are referenced explicitly instead of attach()-ing diamonds.
hist(diamonds$price, breaks = 20, main = "price in 20 categories")
hist(diamonds$price, breaks = 50, main = "price in 50 categories")
par(old_par)

This gave me the two graphs but did not vary the bin width as specified; it turns out that 20 is just the default in this case. It appears that binwidth is not a graphical parameter.

help(layout)
par()  #get graphical parameters of the current device. Utterly unintelligible; ?par describes what each entry means.
## $xlog
## [1] FALSE
## 
## $ylog
## [1] FALSE
## 
## $adj
## [1] 0.5
## 
## $ann
## [1] TRUE
## 
## $ask
## [1] FALSE
## 
## $bg
## [1] "transparent"
## 
## $bty
## [1] "o"
## 
## $cex
## [1] 1
## 
## $cex.axis
## [1] 1
## 
## $cex.lab
## [1] 1
## 
## $cex.main
## [1] 1.2
## 
## $cex.sub
## [1] 1
## 
## $cin
## [1] 0.15 0.20
## 
## $col
## [1] "black"
## 
## $col.axis
## [1] "black"
## 
## $col.lab
## [1] "black"
## 
## $col.main
## [1] "black"
## 
## $col.sub
## [1] "black"
## 
## $cra
## [1] 10.8 14.4
## 
## $crt
## [1] 0
## 
## $csi
## [1] 0.2
## 
## $cxy
## [1] 0.02604 0.03876
## 
## $din
## [1] 7 7
## 
## $err
## [1] 0
## 
## $family
## [1] ""
## 
## $fg
## [1] "black"
## 
## $fig
## [1] 0 1 0 1
## 
## $fin
## [1] 7 7
## 
## $font
## [1] 1
## 
## $font.axis
## [1] 1
## 
## $font.lab
## [1] 1
## 
## $font.main
## [1] 2
## 
## $font.sub
## [1] 1
## 
## $lab
## [1] 5 5 7
## 
## $las
## [1] 0
## 
## $lend
## [1] "round"
## 
## $lheight
## [1] 1
## 
## $ljoin
## [1] "round"
## 
## $lmitre
## [1] 10
## 
## $lty
## [1] "solid"
## 
## $lwd
## [1] 1
## 
## $mai
## [1] 1.02 0.82 0.82 0.42
## 
## $mar
## [1] 5.1 4.1 4.1 2.1
## 
## $mex
## [1] 1
## 
## $mfcol
## [1] 1 1
## 
## $mfg
## [1] 1 1 1 1
## 
## $mfrow
## [1] 1 1
## 
## $mgp
## [1] 3 1 0
## 
## $mkh
## [1] 0.001
## 
## $new
## [1] FALSE
## 
## $oma
## [1] 0 0 0 0
## 
## $omd
## [1] 0 1 0 1
## 
## $omi
## [1] 0 0 0 0
## 
## $page
## [1] TRUE
## 
## $pch
## [1] 1
## 
## $pin
## [1] 5.76 5.16
## 
## $plt
## [1] 0.1171 0.9400 0.1457 0.8829
## 
## $ps
## [1] 12
## 
## $pty
## [1] "m"
## 
## $smo
## [1] 1
## 
## $srt
## [1] 0
## 
## $tck
## [1] NA
## 
## $tcl
## [1] -0.5
## 
## $usr
## [1] 0 1 0 1
## 
## $xaxp
## [1] 0 1 5
## 
## $xaxs
## [1] "r"
## 
## $xaxt
## [1] "s"
## 
## $xpd
## [1] FALSE
## 
## $yaxp
## [1] 0 1 5
## 
## $yaxs
## [1] "r"
## 
## $yaxt
## [1] "s"
## 
## $ylbias
## [1] 0.2

From cookbook for r: create the plots and store them.

library(ggplot2)
# Histogram of price with 100-unit-wide bins, stored so it can be combined
# with the other histograms later.
p100 <- ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(colour = "black", binwidth = 100) +
  ggtitle("histogram with binwidth 100")
p100  # print the graph that was just created

# Price histogram with 50-unit-wide bins (stored, not printed).
p50 <- ggplot(diamonds, aes(price)) +
  geom_histogram(binwidth = 50, colour = "black") +
  ggtitle("histogram with binwidth 50")

# Price histogram with 20-unit-wide bins.
p20 <- ggplot(diamonds, aes(price)) +
  geom_histogram(binwidth = 20, colour = "black") +
  ggtitle("histogram with binwidth 20")
p20  #not what I expected. I was thinking that binwidth was the number of bins

plot of chunk unnamed-chunk-4


# Price histogram with 1000-unit-wide bins; the legend is switched off
# explicitly through the theme.
p1000 <- ggplot(diamonds, aes(price)) +
  geom_histogram(binwidth = 1000, colour = "black") +
  ggtitle("histogram with binwidth 1000") +
  theme(legend.position = "none")
p1000

plot of chunk unnamed-chunk-4


# Price histogram with 1-unit-wide bins.
p1 <- ggplot(diamonds, aes(price)) +
  geom_histogram(binwidth = 1, colour = "black") +
  ggtitle("histogram with binwidth 1")
p1

plot of chunk unnamed-chunk-4

# Attempts to lay several stored plots out on one page.
# multiplot() is not provided by ggplot2; it is a helper whose definition has
# to be run (or sourced) in the session before it can be called.
multiplot(p1, p100, p1000, cols = 3)
## Error: could not find function "multiplot"
# couldn't find 'multiplot'
# update.packages() only refreshes packages that are already installed, so it
# cannot supply a function that no installed package defines.
update.packages()  #that was pointless
## Error: trying to use CRAN without setting a mirror
multiplot(p1, p100, cols = 2)
## Error: could not find function "multiplot"

Oh, turns out that multiplot is just created by source code. Have saved it as multiplot.R in the EDA folder

library(grid)
# multiplot() is defined by the Cookbook for R script saved as multiplot.R, so
# that file has to be sourced before the function exists in the session.
# NOTE(review): the path assumes the working directory is the EDA folder --
# confirm before running.
source("multiplot.R")
# Three panels side by side.
multiplot(p1, p100, p1000, cols = 3)
# multiplot() has no `rows` argument; a single column stacks the three plots
# in three rows.
multiplot(p1, p100, p1000, cols = 1)

Now reproduce ramesh-37's plot

# Price histogram with 1-unit-wide bins, restricted to the 325-1450 range.
# xlim() is a function that is added to the plot like any other layer.
pRamesh_1 <- ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(colour = "black", binwidth = 1) +
  xlim(325, 1450) +
  ggtitle("histogram with binwidth 1")
pRamesh_1

Now I use qplot

pRamesh_1 <-
  qplot(price, data = diamonds, binwidth = 5,  xlim = c(326,1450))
pRamesh_1

plot of chunk unnamed-chunk-8

qplot(price, data = diamonds)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-8

p1

plot of chunk unnamed-chunk-8


# Build a price histogram limited to the 326-1450 range for one bin width;
# the bin width is also written into the plot title.
ramesh_histogram <- function(bin_width) {
  ggplot(diamonds, aes(price)) +
    geom_histogram(binwidth = bin_width, colour = "black") +
    scale_x_continuous(limits = c(326, 1450)) +
    ggtitle(paste("histogram with binwidth =", bin_width))
}

pRamesh_5 <- ramesh_histogram(5)

pRamesh_20 <- ramesh_histogram(20)

pRamesh_50 <- ramesh_histogram(50)

# Stack the three histograms in a single column.
multiplot(pRamesh_5, pRamesh_20, pRamesh_50, cols = 1)
## Error: could not find function "multiplot"

Now we break out diamonds by cut

levels(cut) #in the recorded session a bare `cut` is not the diamonds column (base R's cut() function is found instead), hence NULL; use diamonds$cut
## NULL

names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
levels(diamonds$cut)
## [1] "Fair"      "Good"      "Very Good" "Premium"   "Ideal"

# Build the 326-1450 price histogram (20-unit bins) for a single cut level;
# the level name is appended to the plot title.
cut_histogram <- function(level) {
  ggplot(diamonds[diamonds$cut == level, ], aes(price)) +
    geom_histogram(binwidth = 20, colour = "black") +
    scale_x_continuous(limits = c(326, 1450)) +
    ggtitle(paste("histogram cut", level))
}

diamondsPriceCutFair <- cut_histogram("Fair")

diamondsPriceCutGood <- cut_histogram("Good")

diamondsPriceCutVery <- cut_histogram("Very Good")

diamondsPriceCutPremium <- cut_histogram("Premium")

diamondsPriceCutIdeal <- cut_histogram("Ideal")

# Stack the five per-cut histograms in a single column.
multiplot(
  diamondsPriceCutFair,
  diamondsPriceCutGood,
  diamondsPriceCutVery,
  diamondsPriceCutPremium,
  diamondsPriceCutIdeal,
  cols = 1
)
## Error: could not find function "multiplot"
# Per-cut price summaries. summary() and max() have no `data` argument, so
# the filtering has to happen before the call, and columns must be referenced
# through the data frame (a bare `cut` or `price` is not the diamonds column).
summary(subset(diamonds$price, diamonds$cut == "Fair"))
summary(subset(diamonds, cut == "Good")$price)
summary(subset(diamonds, cut == "Very Good")$price)
summary(subset(diamonds, cut == "Premium")$price)
summary(subset(diamonds, cut == "Ideal")$price)
max(subset(diamonds, cut == "Fair")$price)
# Equivalent two-step form: keep the subset, then summarise its price column.
diamondsFair <- subset(diamonds, cut == "Fair")
summary(diamondsFair$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3280    4360    5210   18600

# Price summary for the "Fair" cut only.
# NOTE: the summary() output recorded here is rounded (Max. prints as 18600),
# whereas max() reports the exact value 18574.
diamondsFair <- subset(diamonds, cut == "Fair")
summary(diamondsFair$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3280    4360    5210   18600
max(diamondsFair$price)
## [1] 18574
min(diamondsFair$price)
## [1] 337

diamondsGood <- subset(diamonds, cut == "Good")
summary(diamondsGood$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327    1140    3050    3930    5030   18800
max(diamondsGood$price)
## [1] 18788
min(diamondsGood$price)
## [1] 327

diamondsVGood <- subset(diamonds, cut == "Very Good")
summary(diamondsVGood$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     912    2650    3980    5370   18800
max(diamondsVGood$price)
## [1] 18818
min(diamondsVGood$price)
## [1] 336

diamondsPremium <- subset(diamonds, cut == "Premium")
summary(diamondsPremium$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1050    3180    4580    6300   18800
max(diamondsPremium$price)
## [1] 18823
min(diamondsPremium$price)
## [1] 326

diamondsIdeal <- subset(diamonds, cut == "Ideal")
summary(diamondsIdeal$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     878    1810    3460    4680   18800
max(diamondsIdeal$price)
## [1] 18806
min(diamondsIdeal$price)
## [1] 326

qplot(x = price, data = diamonds) + facet_wrap(~cut)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-10

Produce the array of graphs with the y axes adjusted to each level of cut.

qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-11


# Log-scaled price histogram with one panel per cut. The data frame goes to
# ggplot(), aes() only maps columns, and the log scale is added as its own
# component rather than being nested inside aes().
logPlot <- ggplot(data = diamonds, aes(x = price)) +
  geom_histogram() +
  scale_x_log10() +
  facet_wrap(~cut)

qplot(x = log10(price), data = diamonds) + facet_wrap(~cut)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-11


logPlot <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 100) + 
    scale_x_log10()

logPlot

plot of chunk unnamed-chunk-11

Try it without the log10 thing

logPlot <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 1) + 
    scale_x_log10()
logPlot

plot of chunk unnamed-chunk-12


nonLogPlot_bin1 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 1) + 
    ggtitle("non-Log Plot, bin 1")
nonLogPlot_bin1

plot of chunk unnamed-chunk-12


nonLogPlot_bin10 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 10) + 
    ggtitle("nonLogPlot")
nonLogPlot_bin10

plot of chunk unnamed-chunk-12


nonLogPlot_bin100 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 100) + 
    ggtitle("nonLogPlot")
nonLogPlot_bin100

plot of chunk unnamed-chunk-12


nonLogPlot_bin1000 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 1000) + 
    ggtitle("nonLogPlot")
nonLogPlot_bin1000

plot of chunk unnamed-chunk-12


logPlot_bin0.01 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 0.01) + 
    scale_x_log10()
# xlim(0,18000)
logPlot_bin0.01
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk unnamed-chunk-12


logPlot_bin0.1 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 0.1) + 
    scale_x_log10()
logPlot_bin0.1

plot of chunk unnamed-chunk-12


logPlot_bin1 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 1) + 
    scale_x_log10() + ggtitle("Log Plot, bin = 1")
logPlot_bin1

plot of chunk unnamed-chunk-12


logPlot_bin10 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 10) + 
    scale_x_log10() + ggtitle("Log Plot, bin 10")
logPlot_bin10

plot of chunk unnamed-chunk-12


logPlot_bin100 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 100) + 
    scale_x_log10() + ggtitle("Log Plot, bin 100")
logPlot_bin100

plot of chunk unnamed-chunk-12


# With scale_x_log10() the bin width is measured in log10 units, so 1000 is
# vastly wider than the whole price range (326 to about 18800, i.e. roughly
# 2.5 to 4.3 on the log10 axis) -- presumably the cause of the error below.
logPlot_bin1000 <- ggplot(data = diamonds, aes(x = price)) + geom_histogram(binwidth = 1000) + 
    scale_x_log10() + ggtitle("Log Plot, bin 1000")
logPlot_bin1000
## Error: 'from' cannot be NA, NaN or infinite

Moritz's code:

# Price per carat for every diamond, in the same row order as diamonds.
price_per_carat <- diamonds$price / diamonds$carat
# Log-scaled histogram of price per carat, one row per cut, each panel with
# its own y axis.
qplot(
  x = price_per_carat,
  data = diamonds,
  color = I("black"),
  fill = I("blue")
) +
  scale_x_log10() +
  facet_wrap(~cut, ncol = 1, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-13

Now change it to keep y scale constant

# Same faceted histogram with a shared y axis. The fill is passed straight to
# qplot(): adding a separate geom_bar() layer draws a second histogram on top
# of the one qplot() already creates (and doubles the stat_bin messages).
qplot(x = price_per_carat, data = diamonds, fill = I("blue")) +
  scale_x_log10() +
  facet_wrap(~cut, ncol = 1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-14

Plotting two density curves over each other:

# Side-by-side (dodged) histograms of price per carat, one colour per cut.
# price_per_carat is continuous, so the layer is a histogram, and the
# argument name is spelled out instead of relying on partial matching (pos).
ggplot(diamonds, aes(price_per_carat, fill = cut)) +
  geom_histogram(position = "dodge") +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-15


ggplot(diamonds, aes(price_per_carat, fill = cut)) + geom_density(alpha = 0.2) + 
    scale_x_log10()

plot of chunk unnamed-chunk-15

Or you can get two plots by using position = “identity”

ggplot(diamonds, aes(price, fill = cut)) + geom_histogram(alpha = 0.5, aes(y = ..density..), 
    position = "identity")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-16

Or use a density plot with alpha = 0.2

ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + scale_x_log10()

plot of chunk unnamed-chunk-17


ggplot(diamonds, aes(price_per_carat, fill = cut)) + geom_density(alpha = 0.2) + 
    scale_x_log10()

plot of chunk unnamed-chunk-17

Now use boxplots

ggplot(diamonds, aes(y = price/carat, x = cut)) + geom_boxplot() + scale_y_log10()

plot of chunk unnamed-chunk-18


summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2400    3930    5320   18800

scale_colour_brewer()
## discrete_scale(aesthetics = "colour", scale_name = "brewer", 
##     palette = brewer_pal(type, palette))

# summary() has no `data` argument (extra arguments are ignored), so passing a
# subset that way always summarises the full price column. Group the prices
# explicitly instead; by() applies summary() once per level of the factor.
by(diamonds$price, diamonds$cut, summary)
# The per-colour breakdown is produced by the by(..., diamonds$color, ...)
# call further down.

IQR(subset(diamonds, color == "J")$price)
## [1] 5834
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1840    3170    4210   18700 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1740    3080    4000   18700 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2340    3720    4870   18800 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2240    4000    6050   18800 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4490    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5090    7200   18800 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4230    5320    7700   18700
# Open the help page for IQR(); the function is named, not called.
?IQR

ggplot(diamonds, aes(y = price/carat, x = color)) + geom_boxplot()

plot of chunk unnamed-chunk-18


ggplot(diamonds, aes(y = log10(price/carat), x = color)) + geom_boxplot()

plot of chunk unnamed-chunk-18


ggplot(diamonds, aes(y = sqrt(price/carat), x = color)) + geom_boxplot() + scale_y_log10()

plot of chunk unnamed-chunk-18



ggplot(diamonds, aes(y = price/carat, x = color)) + geom_boxplot()

plot of chunk unnamed-chunk-18


# Undoing a log10 transform means raising 10 to the value (10^x), not raising
# the value to the 10th power (x^10).
10^4.25
## [1] 17782.79
summary(diamonds$price/diamonds$carat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1050    2480    3500    4010    4950   17800
summary(log10(diamonds$price/diamonds$carat))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.02    3.39    3.54    3.55    3.69    4.25
# Back-transforming recovers the original price-per-carat summary.
summary(10^log10(diamonds$price/diamonds$carat))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1050    2480    3500    4010    4950   17800

ggplot(data = diamonds, aes(x = color, y = price/carat)) + geom_boxplot() + 
    scale_y_log10()

plot of chunk unnamed-chunk-18

Frequency Polygon

ggplot(diamonds, aes(x = carat)) + geom_freqpoly(binwidth = 0.1)

plot of chunk unnamed-chunk-19

hist(diamonds$carat)

plot of chunk unnamed-chunk-19

summary(diamonds$carat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   0.400   0.700   0.798   1.040   5.010
warnings()
## NULL
# Carat histogram (0.1-wide bins) with a dashed red reference line at a count
# of 2000. A horizontal reference line is drawn with geom_hline() rather than
# a zero-slope geom_abline().
qplot(data = diamonds, x = carat, binwidth = 0.1, color = I("black"), fill = I("turquoise")) +
  scale_x_continuous(limits = c(0, 6), breaks = seq(0, 6, 0.1)) +
  geom_hline(yintercept = 2000, color = "red", size = 1, lty = "dashed")

plot of chunk unnamed-chunk-19