Load main libs that will be used during the assignment
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(scales))
suppressMessages(library(xlsx))
suppressMessages(library(tidyr))
suppressMessages(library(lubridate))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))
# a) Load the 'diamonds' data set in R Studio.
# How many observations are in the data set?
nrow(diamonds)
## [1] 53940
#b) How many variables are in the data set?
ncol(diamonds)
## [1] 10
# c) How many ordered factors are in the set?
#str(diamonds)
# 3
# d) What letter represents the best color for a diamonds?
levels(diamonds$color)
## [1] "D" "E" "F" "G" "H" "I" "J"
# Create a histogram of the price of
# all the diamonds in the diamond data set.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 500) +
scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Price") + ylab("Count")
#Q3.3
# Describe the shape and center of the price distribution. Include summary statistics like the mean and median.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
#
# The distribution is right-skewed with small amounts of very large prices driving up the mean, while the median remains a more robust measure of the center of the distribution.
# a) How many diamonds cost less than $500?
diamonds %>%
filter(price < 500) %>%
summarise(n = n())
## Source: local data frame [1 x 1]
##
## n
## (int)
## 1 1729
# b) How many diamonds cost less than $250?
diamonds %>%
filter(price < 250) %>%
summarise(n = n())
## Source: local data frame [1 x 1]
##
## n
## (int)
## 1 0
# a) How many diamonds cost less than $500?
diamonds %>%
filter(price >= 15000) %>%
summarise(n = n())
## Source: local data frame [1 x 1]
##
## n
## (int)
## 1 1656
# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(300, 2000, 100)) +
theme(axis.text.x = element_text(angle = 45)) +
coord_cartesian(c(0,2000)) +
xlab("Price") + ylab("Count")
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(300, 4000, 100)) +
theme(axis.text.x = element_text(angle = 45)) +
coord_cartesian(c(300,4000)) +
facet_grid(cut~.) +
xlab("Price") + ylab("Count")
# a) Which cut has the highest priced diamond?
# Premium
by(diamonds$price, diamonds$cut, max)
## diamonds$cut: Fair
## [1] 18574
## --------------------------------------------------------
## diamonds$cut: Good
## [1] 18788
## --------------------------------------------------------
## diamonds$cut: Very Good
## [1] 18818
## --------------------------------------------------------
## diamonds$cut: Premium
## [1] 18823
## --------------------------------------------------------
## diamonds$cut: Ideal
## [1] 18806
# In the two last exercises, we looked at
# the distribution for diamonds by cut.
# Run the code below in R Studio to generate
# the histogram as a reminder.
# ===============================================================
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
ggplot(diamonds, aes(x = price/carat)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = .05) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_log10(expression(paste(Log[10], " of Price")),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
facet_grid(cut~., scale = "free") + ylab("Count")
#Q3.10
# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
diamonds %>%
group_by(color) %>%
summarise(count = n(),
avg_price = mean(price))
## Source: local data frame [7 x 3]
##
## color count avg_price
## (fctr) (int) (dbl)
## 1 D 6775 3169.954
## 2 E 9797 3076.752
## 3 F 9542 3724.886
## 4 G 11292 3999.136
## 5 H 8304 4486.669
## 6 I 5422 5091.875
## 7 J 2808 5323.818
# There are many more Ideal diamonds than others, but the average price is also the lowest.
ggplot(diamonds, aes(x = clarity, y = price, color = color)) +
geom_boxplot() +
facet_grid(cut~., margins = TRUE)
# This boxplot matrix shows the distribution of price across all 3 categorical variables; cut, clarity, and color.
#To know colors order
# by(diamonds$price, diamonds$color, summary)
# a) What is the price range for the middle 50% of the diamonds with color D?
# c) What is the IQR for diamonds with the best color?
diamonds %>%
group_by(color) %>%
filter(color == "D") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## (fctr) (dbl) (dbl) (dbl)
## 1 D 911 4213.5 3302.5
# b) What is the price range for the middle 50% of diamonds with color J?
# d) What is the IQR for the diamonds with the worst color?
diamonds %>%
group_by(color) %>%
filter(color == "J") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## (fctr) (dbl) (dbl) (dbl)
## 1 J 1860.5 7695 5834.5
# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.
ggplot(diamonds, aes(x = color, y = price/carat, fill = color)) +
geom_boxplot() +
coord_cartesian(ylim=c(1000, 6000)) +
scale_y_continuous(labels=dollar) +
xlab("Color") + ylab("Price per Carat")
# Investigate the weight of the diamonds (carat) using a frequency polygon. Use different bin widths to see how the frequency polygon changes. What carat size has a count greater than 2000? Check all that apply.
# 0.3 and 1.01 in green
sizes = c(0.1, 0.3, 0.8, 1.01, 1.6, 2.0, 3.0, 5.0)
summary(diamonds$carat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2000 0.4000 0.7000 0.7979 1.0400 5.0100
ggplot(diamonds, aes(x=carat)) +
geom_freqpoly(binwidth=0.1, alpha = 0.75) +
scale_x_continuous(breaks=sizes, expand = c(0,0)) +
scale_y_continuous(expand=c(0,0))+
geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "red", linetype="dashed", alpha = 0.75) +
geom_vline(xintercept = c(0.3, 1.01), color = "forestgreen", linetype = "twodash") +
geom_hline(yintercept = 2000, color = "brown", linetype="longdash", alpha = 0.5) +
xlab("Carat Size") + ylab("Count")
#Q3.14
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# http://spreadsheets.google.com/pub?key=rdCufG2vozTpKw7TBGbyoWw&output=xls
hours <- tbl_df(read.xlsx("indicator_hours per week.xlsx", sheetName="Data", header=TRUE))
hours <- hours %>%
select(-NA.) %>% # Remove NA column carried over from xlsx
rename(Country = Working.hours.per.week) %>%
filter(Country != "<NA>") # Remove <NA> row carried over from xlsx
hours.long <- hours %>% gather("Year", "Hours",2:29)
hours.long <- hours.long %>%
mutate(Year = as.character(Year), # Convert to character
Year = substr(Year, 2, 5), # Slice out the X, leaving last 4 digits; R added X since initially since column names can't start with numbers.
Year = as.numeric(Year)) # Cast as numeric
yearStats <- hours.long %>%
group_by(Year) %>%
summarise(median = median(Hours, na.rm=TRUE),
mean = mean(Hours, na.rm=TRUE),
lower = min(Hours, na.rm=TRUE),
upper = max(Hours, na.rm=TRUE),
se = sd(Hours, na.rm=TRUE)/sqrt(length(Hours)),
avg_upper = mean + (2.101*se),
avg_lower = mean - (2.101*se),
quant.25 = quantile(Hours, na.rm=TRUE, 0.25),
quant.75 = quantile(Hours, na.rm=TRUE, 0.75))
yearStats <- round(yearStats, 2)
p <- ggplot(yearStats, aes(x=Year, y=median)) +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=lower, ymax=upper), colour = "wheat2", alpha=1, size=5) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=quant.25, ymax=quant.75), colour = "wheat4", size=5) +
geom_line(yearStats, mapping=aes(x=Year, y=median, group=1)) +
geom_vline(xintercept = 1980, colour = "wheat4", linetype=1, size=1) +
geom_hline(yintercept=seq(26, 56, 2), color="white", linetype=1)
dottedYears <- seq(1980, 2007, 5) # Pick the years to draw dotted vertical lines on
p <- p + geom_vline(xintercept = dottedYears, color="wheat4", linetype=3, size=0.5)
p <- p + coord_cartesian(ylim = c(26,58))+
scale_y_continuous(breaks=seq(26, 60, 2)) +
scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) )
p <- p + geom_line(data = subset(hours.long, Country == "Egypt"), aes(x = Year, y = Hours, group = Country), color ="brown") +
annotate("segment", x=2000, xend=2002, y=35.5, yend=36, color="brown") +
annotate("text", x=2003, y=36.3, label="Egypt Hours", size=3.5, color="brown") +
annotate("segment", x=2000, xend=2001, y=33.5, yend=32) +
annotate("text", x=2002, y=31.7, label="World Medians", size=3.5)
p1 <- p + annotate("text", x=1999.9, y=56, label="Data represents hours worked per week for 52 countries ", size=3, color="gray30") +
annotate("text", x=2000, y=55, label="from 1980 to 2007. Outer lighter bands show the min/max ", size=3, color="gray30") +
annotate("text", x=2000, y=54, label="hours for each year, and inner darker bands show the IQR.", size=3, color="gray30") +
ggtitle("World's Working Hours") +
theme(plot.title=element_text(face="bold",hjust=.95,vjust=.8,color="#3C3C3C",size=20)) +
annotate("text", x=1994.6, y=57.5, label="Weekly", size=4, fontface="bold")
p1
# How many birthdays are in each month?
# Which day of the year has the most number of birthdays?
# Do you have at least 365 friends that have birthdays on everyday
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.
# You can expect to spend 30 min or more on this task depending on if
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.
# You'll need to export your Facebooks friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends’ birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.
# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.
birthdays <- tbl_df(read.csv("birthdaysExample.csv"))
tempDates <- mdy(birthdays$dates)
birthdays <- birthdays %>%
mutate(Birthday = tempDates,
Year = year(tempDates),
Month = month(tempDates, label=TRUE, abbr=FALSE),
Day = day(tempDates),
Weekday = weekdays(tempDates, abbr=FALSE))
birthdays$Weekday <- factor(birthdays$Weekday, levels=c('Monday', 'Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'), ordered=TRUE)
# ifelse didn't seem to work here for creating 'optional' since the else would create a NULL vector within the function and R complained.
# creates optional ggplot argument only if Day is passed in.
# aes_string takes in a string as the column name for plotting. Perfect for our arguments we pass in as strings.
# Paste our argument in for custom X and Y-axis labeling
makePlots<- function(TimeLength){
optional<- NULL
if(TimeLength == "Day") optional<- scale_x_discrete(breaks = seq(1, 31, 1))
if(TimeLength == "Month") optional<- scale_x_discrete(breaks = seq(1, 12, 1))
if(TimeLength == "Month") optional<- scale_x_discrete(breaks = seq(1, 12, 1))
ggplot(birthdays, aes_string(x = TimeLength, fill = TimeLength)) +
geom_bar() +
scale_fill_brewer(palette="Paired") +
xlab(TimeLength)+
ylab(paste0("Number of Birthdays per ", TimeLength)) +
theme_tufte() +
optional
}
makePlots("Month")
makePlots("Day")
makePlots("Weekday")