# Notes:
library(ggplot2)
# Load the diamonds data set that ships with ggplot2 into the workspace.
data("diamonds", package = "ggplot2")
# Help page: ?diamonds
# Show the column names, types and first few values of the data frame.
str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Notes:
# Help page: ?diamonds
# Re-inspect the column names and types of the data set.
str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Notes:
# Histogram of diamond prices using bins that are $300 wide.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 300)
# Per-column summary statistics for the whole data set.
summary(object = diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
# price
# Min. : 326
# 1st Qu.: 950
# Median : 2401
# Mean : 3933
# 3rd Qu.: 5324
# Max. : 18823
# Notes:
# How many diamonds cost less than $500?
# How many diamonds cost less than $250?
# How many diamonds cost more than $15,000?
# Diamonds priced under $500.
inexpensive <- diamonds[which(diamonds$price < 500), ]
nrow(inexpensive)
## [1] 1729
# Diamonds priced under $250.
cheap <- diamonds[which(diamonds$price < 250), ]
nrow(cheap)
## [1] 0
# Diamonds priced at $15,000 or above.
# NOTE(review): the question asks for "more than $15,000" but this filter is
# inclusive (>=); confirm which bound is intended before changing it, since
# the recorded count below depends on it.
expensive <- diamonds[which(diamonds$price >= 15000), ]
nrow(expensive)
## [1] 1656
# Notes:
# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
# qplot(x = price, data = diamonds)
# ggsave('priceHistogram.png')
# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
# Submit your final code when you are ready.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================================
# Inspiration:
# qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
# scale_x_continuous(limits = c(0, 1000),
# breaks = seq(0, 1000, 50)) + facet_wrap(~gender)
# This is the histogram created earlier with a high peak at around $700 on the X-Axis
# Zoom in on the largest peak of the price histogram ($0-$1,500, $15 bins).
# coord_cartesian() zooms the view without discarding observations. Setting
# `limits` on the x scale instead removed every diamond priced above $1,500
# before binning (the "Removed 33930 rows" warning) and can also drop bins
# that touch the limits. The price column has no missing values (see the
# summary output), so no is.na() filter is needed.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 15) +
  scale_x_continuous(breaks = seq(0, 1500, 100)) +
  coord_cartesian(xlim = c(0, 1500)) +
  labs(x = "Price", y = "Frequency")
# Facets that were considered but left out:
# facet_wrap(~cut) + facet_wrap(~color) + facet_wrap(~clarity)
# Experiments:
# Same zoom with $1-wide bins, starting the view at $300. coord_cartesian()
# is used instead of scale limits so that no diamonds are discarded before
# binning and no "Removed ... rows" warning is raised.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(300, 1500, 100)) +
  coord_cartesian(xlim = c(300, 1500)) +
  labs(x = "Price", y = "Frequency")
# Diamonds priced under $1,500, with their row count and column summaries.
midRange <- diamonds[which(diamonds$price < 1500), ]
nrow(midRange)
## [1] 20010
summary(object = midRange)
## carat cut color clarity depth
## Min. :0.2000 Fair : 266 D:2848 VS2 :4514 Min. :51.00
## 1st Qu.:0.3100 Good :1548 E:4360 SI1 :4279 1st Qu.:61.10
## Median :0.3400 Very Good:4186 F:3617 VS1 :3196 Median :61.80
## Mean :0.3677 Premium :4492 G:4351 VVS2 :2595 Mean :61.73
## 3rd Qu.:0.4100 Ideal :9518 H:2683 VVS1 :2360 3rd Qu.:62.40
## Max. :1.0300 I:1543 SI2 :1741 Max. :78.20
## J: 608 (Other):1325
## table price x y
## Min. :44.00 Min. : 326.0 Min. :3.730 Min. :3.68
## 1st Qu.:56.00 1st Qu.: 630.0 1st Qu.:4.350 1st Qu.:4.35
## Median :57.00 Median : 805.0 Median :4.500 Median :4.51
## Mean :57.01 Mean : 841.4 Mean :4.581 Mean :4.59
## 3rd Qu.:58.00 3rd Qu.:1024.0 3rd Qu.:4.770 3rd Qu.:4.78
## Max. :70.00 Max. :1454.0 Max. :6.650 Max. :6.09
##
## z
## Min. :2.06
## 1st Qu.:2.69
## Median :2.77
## Mean :2.83
## 3rd Qu.:2.95
## Max. :4.44
##
# Notes:
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================
# Price histogram ($10 bins) drawn in a separate panel for each cut.
ggplot(data = subset(diamonds, !is.na(price)), mapping = aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~cut) +
  labs(x = "Price", y = "Frequency")
# Notes:
# Summary statistics for every column of the full data set.
summary(object = diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
# Keep only the 'Fair' cut diamonds and summarise their columns.
fair <- diamonds[which(diamonds$cut == "Fair"), ]
# fair
summary(object = fair)
## carat cut color clarity depth
## Min. :0.220 Fair :1610 D:163 SI2 :466 Min. :43.00
## 1st Qu.:0.700 Good : 0 E:224 SI1 :408 1st Qu.:64.40
## Median :1.000 Very Good: 0 F:312 VS2 :261 Median :65.00
## Mean :1.046 Premium : 0 G:314 I1 :210 Mean :64.04
## 3rd Qu.:1.200 Ideal : 0 H:303 VS1 :170 3rd Qu.:65.90
## Max. :5.010 I:175 VVS2 : 69 Max. :79.00
## J:119 (Other): 26
## table price x y
## Min. :49.00 Min. : 337 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 2050 1st Qu.: 5.630 1st Qu.: 5.570
## Median :58.00 Median : 3282 Median : 6.175 Median : 6.100
## Mean :59.05 Mean : 4359 Mean : 6.247 Mean : 6.183
## 3rd Qu.:61.00 3rd Qu.: 5206 3rd Qu.: 6.700 3rd Qu.: 6.640
## Max. :95.00 Max. :18574 Max. :10.740 Max. :10.540
##
## z
## Min. :0.000
## 1st Qu.:3.610
## Median :3.970
## Mean :3.983
## 3rd Qu.:4.280
## Max. :6.980
##
# Keep only the 'Good' cut diamonds and summarise their columns.
good <- diamonds[which(diamonds$cut == "Good"), ]
# good
summary(object = good)
## carat cut color clarity depth
## Min. :0.2300 Fair : 0 D:662 SI1 :1560 Min. :54.30
## 1st Qu.:0.5000 Good :4906 E:933 SI2 :1081 1st Qu.:61.30
## Median :0.8200 Very Good: 0 F:909 VS2 : 978 Median :63.40
## Mean :0.8492 Premium : 0 G:871 VS1 : 648 Mean :62.37
## 3rd Qu.:1.0100 Ideal : 0 H:702 VVS2 : 286 3rd Qu.:63.80
## Max. :3.0100 I:522 VVS1 : 186 Max. :67.00
## J:307 (Other): 167
## table price x y
## Min. :51.00 Min. : 327 Min. :0.000 Min. :0.000
## 1st Qu.:56.00 1st Qu.: 1145 1st Qu.:5.020 1st Qu.:5.020
## Median :58.00 Median : 3050 Median :5.980 Median :5.990
## Mean :58.69 Mean : 3929 Mean :5.839 Mean :5.851
## 3rd Qu.:61.00 3rd Qu.: 5028 3rd Qu.:6.420 3rd Qu.:6.440
## Max. :66.00 Max. :18788 Max. :9.440 Max. :9.380
##
## z
## Min. :0.00
## 1st Qu.:3.07
## Median :3.70
## Mean :3.64
## 3rd Qu.:4.03
## Max. :5.79
##
# Keep only the 'Very Good' cut diamonds and summarise their columns.
verygood <- diamonds[which(diamonds$cut == "Very Good"), ]
# verygood
summary(object = verygood)
## carat cut color clarity
## Min. :0.2000 Fair : 0 D:1513 SI1 :3240
## 1st Qu.:0.4100 Good : 0 E:2400 VS2 :2591
## Median :0.7100 Very Good:12082 F:2164 SI2 :2100
## Mean :0.8064 Premium : 0 G:2299 VS1 :1775
## 3rd Qu.:1.0200 Ideal : 0 H:1824 VVS2 :1235
## Max. :4.0000 I:1204 VVS1 : 789
## J: 678 (Other): 352
## depth table price x
## Min. :56.80 Min. :44.00 Min. : 336 Min. : 0.000
## 1st Qu.:60.90 1st Qu.:56.00 1st Qu.: 912 1st Qu.: 4.750
## Median :62.10 Median :58.00 Median : 2648 Median : 5.740
## Mean :61.82 Mean :57.96 Mean : 3982 Mean : 5.741
## 3rd Qu.:62.90 3rd Qu.:59.00 3rd Qu.: 5373 3rd Qu.: 6.470
## Max. :64.90 Max. :66.00 Max. :18818 Max. :10.010
##
## y z
## Min. :0.00 Min. : 0.00
## 1st Qu.:4.77 1st Qu.: 2.95
## Median :5.77 Median : 3.56
## Mean :5.77 Mean : 3.56
## 3rd Qu.:6.51 3rd Qu.: 4.02
## Max. :9.94 Max. :31.80
##
# Keep only the 'Premium' cut diamonds and summarise their columns.
premium <- diamonds[which(diamonds$cut == "Premium"), ]
# premium
summary(object = premium)
## carat cut color clarity depth
## Min. :0.200 Fair : 0 D:1603 SI1 :3575 Min. :58.00
## 1st Qu.:0.410 Good : 0 E:2337 VS2 :3357 1st Qu.:60.50
## Median :0.860 Very Good: 0 F:2331 SI2 :2949 Median :61.40
## Mean :0.892 Premium :13791 G:2924 VS1 :1989 Mean :61.26
## 3rd Qu.:1.200 Ideal : 0 H:2360 VVS2 : 870 3rd Qu.:62.20
## Max. :4.010 I:1428 VVS1 : 616 Max. :63.00
## J: 808 (Other): 435
## table price x y
## Min. :51.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:58.00 1st Qu.: 1046 1st Qu.: 4.800 1st Qu.: 4.790
## Median :59.00 Median : 3185 Median : 6.110 Median : 6.060
## Mean :58.75 Mean : 4584 Mean : 5.974 Mean : 5.945
## 3rd Qu.:60.00 3rd Qu.: 6296 3rd Qu.: 6.800 3rd Qu.: 6.760
## Max. :62.00 Max. :18823 Max. :10.140 Max. :58.900
##
## z
## Min. :0.000
## 1st Qu.:2.940
## Median :3.720
## Mean :3.647
## 3rd Qu.:4.160
## Max. :8.060
##
# Keep only the 'Ideal' cut diamonds and summarise their columns.
ideal <- diamonds[which(diamonds$cut == "Ideal"), ]
# ideal
summary(object = ideal)
## carat cut color clarity
## Min. :0.2000 Fair : 0 D:2834 VS2 :5071
## 1st Qu.:0.3500 Good : 0 E:3903 SI1 :4282
## Median :0.5400 Very Good: 0 F:3826 VS1 :3589
## Mean :0.7028 Premium : 0 G:4884 VVS2 :2606
## 3rd Qu.:1.0100 Ideal :21551 H:3115 SI2 :2598
## Max. :3.5000 I:2093 VVS1 :2047
## J: 896 (Other):1358
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. :0.000
## 1st Qu.:61.30 1st Qu.:55.00 1st Qu.: 878 1st Qu.:4.540
## Median :61.80 Median :56.00 Median : 1810 Median :5.250
## Mean :61.71 Mean :55.95 Mean : 3458 Mean :5.507
## 3rd Qu.:62.20 3rd Qu.:57.00 3rd Qu.: 4678 3rd Qu.:6.440
## Max. :66.70 Max. :63.00 Max. :18806 Max. :9.650
##
## y z
## Min. : 0.000 Min. :0.000
## 1st Qu.: 4.550 1st Qu.:2.800
## Median : 5.260 Median :3.230
## Mean : 5.520 Mean :3.401
## 3rd Qu.: 6.445 3rd Qu.:3.980
## Max. :31.800 Max. :6.030
##
# FAIR GOOD VG PREMIUM IDEAL
# Highest: 18574 18788 18818 18823 18806
# Lowest Median: 3282 3050 2648 3185 1810
# Lowest: 337 327 336 326 326
# Notes:
# In the two last exercises, we looked at
# the distribution for diamonds by cut.
# Run the code below in R Studio to generate
# the histogram as a reminder.
# ===============================================================
# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table,
# the median and quartiles are reasonably close to each other.
# diamonds$cut: Fair
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 337 2050 3282 4359 5206 18570
# ------------------------------------------------------------------------
# diamonds$cut: Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 327 1145 3050 3929 5028 18790
# ------------------------------------------------------------------------
# diamonds$cut: Very Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 336 912 2648 3982 5373 18820
# ------------------------------------------------------------------------
# diamonds$cut: Premium
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 1046 3185 4584 6296 18820
# ------------------------------------------------------------------------
# diamonds$cut: Ideal
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 878 1810 3458 4678 18810
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# ===============================================================
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# The 'Fair' and 'Good' diamonds appear to have
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right.
# Let's look in to this more.
# Look up the documentation for facet_wrap in R Studio.
# Then, scroll back up and add a parameter to facet_wrap so that
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.
# Price histograms per cut, all panels sharing one y-axis.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Solution: Free Scales: http://www.cookbook-r.com/Graphs/Facets_(ggplot2)/
# Same histograms, but each panel gets its own y-axis range.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Notes:
# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================
# Inspiration: price histograms per cut with a free y-axis in each panel.
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Solution: histogram of price per carat on a log10 x-axis, one panel per cut.
# `binwidth` has to be given to geom_histogram(); when it was passed to
# ggplot() it was ignored and the default of 30 bins was used instead.
# Because the x scale is log10-transformed, the bin width is expressed in
# log10 units (0.05 is roughly a 12% price step per bin).
ggplot(data = diamonds, mapping = aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  facet_wrap(~cut, scales = "free_y")
# Experiments
# qplot(x=price, data = subset(diamonds, !is.na(price)), binwidth = 10, xlab = 'Price', ylab = 'Frequency') +
# facet_wrap(~cut)
# qplot(x = price/carat, data = diamonds, binwidth = 200, xlab = 'Price/Carat', ylab = 'Frequency')
# Notes:
# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# =================================================================
# INSPIRATION:
# A better method - using coord_cartesian Layer - adjusting for zoom into 250 count.
# qplot(x= gender, y = friend_count,
# data = subset(pf, !is.na(gender)),
# geom = 'boxplot') +
# coord_cartesian(ylim = c(0, 250))
#
# Get actual Numbers
# by(pf$friend_count, pf$gender, summary)
# Box plot of price for each diamond color; coord_cartesian() zooms the
# y-axis to 0-8000 without removing any observations.
ggplot(data = subset(diamonds, !is.na(price)),
       mapping = aes(x = color, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000))
# Numerical summary of price within each color.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
# diamonds$color: D
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 357 911 1838 3170 4214 18690
# ----------------------------------------------------------------------------------------
# diamonds$color: E
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 882 1739 3077 4003 18730
# ----------------------------------------------------------------------------------------
# diamonds$color: F
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 342 982 2344 3725 4868 18790
# ----------------------------------------------------------------------------------------
# diamonds$color: G
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 354 931 2242 3999 6048 18820
# ----------------------------------------------------------------------------------------
# diamonds$color: H
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 337 984 3460 4487 5980 18800
# ----------------------------------------------------------------------------------------
# diamonds$color: I
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 334 1120 3730 5092 7202 18820
# ----------------------------------------------------------------------------------------
# diamonds$color: J
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 335 1860 4234 5324 7695 18710
# ?diamonds
# Diamonds of color 'D' and their column summaries.
best_color <- diamonds[which(diamonds$color == "D"), ]
# best_color['price']
summary(object = best_color)
## carat cut color clarity depth
## Min. :0.2000 Fair : 163 D:6775 SI1 :2083 Min. :52.2
## 1st Qu.:0.3600 Good : 662 E: 0 VS2 :1697 1st Qu.:61.0
## Median :0.5300 Very Good:1513 F: 0 SI2 :1370 Median :61.8
## Mean :0.6578 Premium :1603 G: 0 VS1 : 705 Mean :61.7
## 3rd Qu.:0.9050 Ideal :2834 H: 0 VVS2 : 553 3rd Qu.:62.5
## Max. :3.4000 I: 0 VVS1 : 252 Max. :71.6
## J: 0 (Other): 115
## table price x y
## Min. :52.0 Min. : 357 Min. :0.000 Min. :0.000
## 1st Qu.:56.0 1st Qu.: 911 1st Qu.:4.590 1st Qu.:4.600
## Median :57.0 Median : 1838 Median :5.230 Median :5.240
## Mean :57.4 Mean : 3170 Mean :5.417 Mean :5.421
## 3rd Qu.:59.0 3rd Qu.: 4214 3rd Qu.:6.180 3rd Qu.:6.180
## Max. :73.0 Max. :18693 Max. :9.420 Max. :9.340
##
## z
## Min. :0.000
## 1st Qu.:2.820
## Median :3.220
## Mean :3.343
## 3rd Qu.:3.840
## Max. :6.270
##
# 4214 - 911 = 3303
# Diamonds of color 'J' and their column summaries.
worst_color <- diamonds[which(diamonds$color == "J"), ]
# worst_color['price']
summary(object = worst_color)
## carat cut color clarity depth
## Min. :0.230 Fair :119 D: 0 SI1 :750 Min. :43.00
## 1st Qu.:0.710 Good :307 E: 0 VS2 :731 1st Qu.:61.20
## Median :1.110 Very Good:678 F: 0 VS1 :542 Median :62.00
## Mean :1.162 Premium :808 G: 0 SI2 :479 Mean :61.89
## 3rd Qu.:1.520 Ideal :896 H: 0 VVS2 :131 3rd Qu.:62.70
## Max. :5.010 I: 0 VVS1 : 74 Max. :73.60
## J:2808 (Other):101
## table price x y
## Min. :51.60 Min. : 335 Min. : 3.930 Min. : 3.900
## 1st Qu.:56.00 1st Qu.: 1860 1st Qu.: 5.700 1st Qu.: 5.718
## Median :58.00 Median : 4234 Median : 6.640 Median : 6.630
## Mean :57.81 Mean : 5324 Mean : 6.519 Mean : 6.518
## 3rd Qu.:59.00 3rd Qu.: 7695 3rd Qu.: 7.380 3rd Qu.: 7.380
## Max. :68.00 Max. :18710 Max. :10.740 Max. :10.540
##
## z
## Min. :2.460
## 1st Qu.:3.530
## Median :4.110
## Mean :4.033
## 3rd Qu.:4.580
## Max. :6.980
##
# 7695 - 1860 = 5835
# Notes:
# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.
# SUBMIT YOUR CODE BELOW THIS LINE
# ===================================================================
# Box plot of price per carat for each diamond color. The y aesthetic must be
# price / carat (not raw price) to match the exercise and the axis label.
ggplot(data = diamonds, mapping = aes(x = color, y = price / carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000)) +
  xlab("Color") +
  ylab("Price Per Carat")
# Numerical summary of price per carat within each color.
# NOTE(review): any recorded output following this call was captured from the
# earlier price-only version and needs to be regenerated.
by(diamonds$price / diamonds$carat, diamonds$color, summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
# Notes:
# Investigate the weight of the diamonds (carat) using a frequency polygon.
# What carat size has a count greater than 2000?
# INSPIRATION:
# qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10, geom = 'freqpoly', color = gender) +
# scale_x_continuous(lim = c(0,1000), breaks = seq(0, 1000, 50))
# Frequency polygon of diamond weight (carat).
# - Bins are 0.01 carat wide, matching the two-decimal weights shown in the
#   data; a 0.001 width only adds empty bins between recorded values.
# - Counts are not split by diamond color, so the height of the line is the
#   total number of diamonds at each weight, which is what a "count greater
#   than 2000" question needs.
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01)
# Count the diamonds at two specific weights. Weights are doubles, so they
# are matched with a small tolerance rather than exact `==`.
weight_ct <- subset(diamonds, abs(carat - 0.3) < 1e-8)
nrow(weight_ct)
## [1] 2604
weight_ct <- subset(diamonds, abs(carat - 1.01) < 1e-8)
nrow(weight_ct)
## [1] 2242
# Every weight held by more than 2000 diamonds, tabulated directly instead of
# probing one weight at a time. local() keeps the helper table out of the
# workspace; its value is still printed.
local({
  carat_counts <- table(diamonds$carat)
  carat_counts[carat_counts > 2000]
})
# Notes:
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# You might use a simple histogram, a boxplot split over a categorical variable,
# or a frequency polygon. The choice is yours!
# You can find a link to the Gapminder website in the Instructor Notes.
# http://www.gapminder.org/data/
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. any questions you answered, your observations, and summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# Read the CSV file. The first column supplies the row names, and
# check.names = TRUE makes the headers syntactically valid column names
# (presumably why the year columns show up as X2008 / X2009 below).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
cpi_df <- read.csv("corruption_perception.csv", header = TRUE, row.names = 1,
                   check.names = TRUE)
# Drop the three extra columns so only the yearly scores remain.
cpi_df <- subset(cpi_df, select = -c(X.1, X.2, X.3))
str(cpi_df)
## 'data.frame': 180 obs. of 2 variables:
## $ X2008: num 9.4 9.3 9.2 9.2 9 8.9 8.9 8.7 8.7 8.7 ...
## $ X2009: num 9.3 9.3 9.2 9.3 8.7 9.2 8.8 8.5 8.9 8.7 ...
# Get Quick Summary
summary(cpi_df)
## X2008 X2009
## Min. :1.100 Min. :1.100
## 1st Qu.:2.500 1st Qu.:2.400
## Median :3.300 Median :3.300
## Mean :4.031 Mean :4.007
## 3rd Qu.:5.125 3rd Qu.:5.025
## Max. :9.400 Max. :9.300
## NA's :4
# Load the Plot Library
library(ggplot2)
# Histogram of the 2008 scores in 0.1-wide bins. The x-axis carries the index
# value and the y-axis the number of observations per bin, so the axis labels
# say exactly that (they previously read 'Year 2008' / 'Corruption Index').
ggplot(data = cpi_df, mapping = aes(x = X2008)) +
  geom_histogram(binwidth = 0.1, color = "blue", fill = "#F79420") +
  # scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab("Corruption Index (2008)") +
  ylab("Count")
# 2008 score (x) against 2009 score (y): every observation is drawn as a
# point, and group = 1 joins them all with a single line.
ggplot(data = cpi_df) +
  geom_line(mapping = aes(x = X2008, y = X2009, group = 1)) +
  geom_point(mapping = aes(x = X2008, y = X2009, group = 1))
## Warning: Removed 4 rows containing missing values (geom_point).
# Compare the score distributions of the two years with one box per year.
# A box plot needs a categorical x, so the two year columns are stacked into
# long form (`values` = score, `ind` = source column name) and rows with a
# missing score are dropped up front. Mapping the continuous X2008 column to
# x gave a single box and a "Continuous x aesthetic" warning.
ggplot(data = na.omit(stack(cpi_df[c("X2008", "X2009")])),
       mapping = aes(x = ind, y = values)) +
  geom_boxplot() +
  labs(x = "Year", y = "Corruption Index")
# Notes:
# Your task is to investigate the distribution of your friends'
# birth months and days.
# Here are some questions you could answer, and we hope you think of others.
# **********************************************************************
# How many people share your birthday? Do you know them?
# (Reserve time with them or save money to buy them a gift!)
# Which month contains the most number of birthdays?
# How many birthdays are in each month?
# Which day of the year has the most number of birthdays?
# Do you have at least 365 friends that have birthdays on everyday
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.
# You can expect to spend 30 min or more on this task depending if you
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.
# You'll need to export your Facebook friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends' birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.
# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.
# We've included some links in the Instructor Notes to help get you started.
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. any questions you answered, your observations, and summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# Copy and paste all of the code that you used for
# your investigation below the line. Submit it when you are ready.
# ===============================================================================
# Read the example birthdays. The first column supplies the row names, and
# check.names = FALSE keeps the column headers exactly as written in the file.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
birthdays_df <- read.csv("birthdaysExample.csv", header = TRUE, row.names = 1,
                         check.names = FALSE)
str(birthdays_df)
## 'data.frame': 1033 obs. of 1 variable:
## $ dates: Factor w/ 348 levels "1/1/2014","1/10/2014",..: 78 258 322 219 131 241 33 46 287 331 ...
# ?strptime
# Keep the date column as a one-column data frame (nothing is printed here).
dates <- birthdays_df["dates"]
# dates
# Create a new DF using the separate() command
# new_date_df = separate(birthdays_df, dates, c("month", "day", "year"), sep="/")
# new_date_df
# Months
# qplot(x=month, data = new_date_df)
# Days
# qplot(x=day, data = new_date_df)
# Frequency for Month: March is most populated.
# count(new_date_df, 'month')
# Frequency: The 14th is the most populated.
# count(new_date_df, 'day')
# > count(new_date_df, 'month')
# month freq
# 1 1 89
# 2 10 89
# 3 11 87
# 4 12 72
# 5 2 79
# 6 3 98
# 7 4 81
# 8 5 72
# 9 6 93
# 10 7 86
# 11 8 91
# 12 9 96
# Notes: