Load main libs that will be used during the assignment

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(scales))
suppressMessages(library(xlsx))
suppressMessages(library(tidyr))
suppressMessages(library(lubridate))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))

Q3.1

# a) Load the 'diamonds' data set in R Studio. 
# How many observations are in the data set?
nrow(diamonds)

## [1] 53940

#b) How many variables are in the data set?
ncol(diamonds)

## [1] 10

# c) How many ordered factors are in the set?

#str(diamonds)
# 3

# d) What letter represents the best color for a diamonds?

levels(diamonds$color)

## [1] "D" "E" "F" "G" "H" "I" "J"

Q3.2

# Create a histogram of the price of
# all the diamonds in the diamond data set.
ggplot(diamonds, aes(x = price)) + 
  geom_histogram(color = "black", fill = "DarkOrange", binwidth = 500) + 
  scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) + 
  theme(axis.text.x = element_text(angle = 90)) + 
  xlab("Price") + ylab("Count")

#Q3.3

# Describe the shape and center of the price distribution. Include summary statistics like the mean and median.
summary(diamonds$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

# 
# The distribution is right-skewed with small amounts of very large prices driving up the mean, while the median remains a more robust measure of the center of the distribution.

Q3.4

# a) How many diamonds cost less than $500?
diamonds %>%
  filter(price < 500) %>%
  summarise(n = n())

## Source: local data frame [1 x 1]
## 
##       n
##   (int)
## 1  1729

# b) How many diamonds cost less than $250?
diamonds %>%
  filter(price < 250) %>%
  summarise(n = n())

## Source: local data frame [1 x 1]
## 
##       n
##   (int)
## 1     0

# a) How many diamonds cost less than $500?
diamonds %>%
  filter(price >= 15000) %>%
  summarise(n = n())

## Source: local data frame [1 x 1]
## 
##       n
##   (int)
## 1  1656

Q3.5

# Explore the largest peak in the
# price histogram you created earlier.

# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.

ggplot(diamonds, aes(x = price)) + 
  geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) + 
  scale_x_continuous(labels = dollar, breaks = seq(300, 2000, 100)) + 
  theme(axis.text.x = element_text(angle = 45)) + 
  coord_cartesian(c(0,2000)) +
  xlab("Price") + ylab("Count")

Q3.6

# Break out the histogram of diamond prices by cut.

# You should have five histograms in separate
# panels on your resulting plot.

ggplot(diamonds, aes(x = price)) + 
  geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) + 
  scale_x_continuous(labels = dollar, breaks = seq(300, 4000, 100)) + 
  theme(axis.text.x = element_text(angle = 45)) + 
  coord_cartesian(c(300,4000)) +
  facet_grid(cut~.) + 
  xlab("Price") + ylab("Count")

Q3.7

# a) Which cut has the highest priced diamond?
# Premium

by(diamonds$price, diamonds$cut, max)

## diamonds$cut: Fair
## [1] 18574
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 18788
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 18818
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 18823
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 18806

Q3.8

# In the two last exercises, we looked at
# the distribution for diamonds by cut.

# Run the code below in R Studio to generate
# the histogram as a reminder.

# ===============================================================

qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Q3.9

# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.

# Adjust the bin width and transform the scale
# of the x-axis using log10.

# Submit your final code when you are ready.

# ENTER YOUR CODE BELOW THIS LINE.
ggplot(diamonds, aes(x = price/carat)) + 
  geom_histogram(color = "black", fill = "DarkOrange", binwidth = .05) + 
  theme(axis.text.x = element_text(angle = 0)) +
  scale_x_log10(expression(paste(Log[10], " of Price")),
                breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x))) + 
  facet_grid(cut~., scale = "free") + ylab("Count")

#Q3.10

# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.

diamonds %>%
  group_by(color) %>%
  summarise(count = n(), 
            avg_price = mean(price))

## Source: local data frame [7 x 3]
## 
##    color count avg_price
##   (fctr) (int)     (dbl)
## 1      D  6775  3169.954
## 2      E  9797  3076.752
## 3      F  9542  3724.886
## 4      G 11292  3999.136
## 5      H  8304  4486.669
## 6      I  5422  5091.875
## 7      J  2808  5323.818

# There are many more Ideal diamonds than others, but the average price is also the lowest.


ggplot(diamonds, aes(x = clarity, y = price, color = color)) + 
  geom_boxplot() + 
  facet_grid(cut~., margins = TRUE)

# This boxplot matrix shows the distribution of price across all 3 categorical variables; cut, clarity, and color.

Q3.11

#To know colors order
# by(diamonds$price, diamonds$color, summary)
# a) What is the price range for the middle 50% of the diamonds with color D?
# c) What is the IQR for diamonds with the best color?



diamonds %>%
  group_by(color) %>%
  filter(color == "D") %>%
  summarise(Quartile.25 = quantile(price, 0.25),
            Quartile.75 = quantile(price, 0.75),
            IQR = Quartile.75 - Quartile.25)

## Source: local data frame [1 x 4]
## 
##    color Quartile.25 Quartile.75    IQR
##   (fctr)       (dbl)       (dbl)  (dbl)
## 1      D         911      4213.5 3302.5

# b) What is the price range for the middle 50% of diamonds with color J?
# d) What is the IQR for the diamonds with the worst color?

diamonds %>%
  group_by(color) %>%
  filter(color == "J") %>%
  summarise(Quartile.25 = quantile(price, 0.25),
            Quartile.75 = quantile(price, 0.75),
            IQR = Quartile.75 - Quartile.25)

## Source: local data frame [1 x 4]
## 
##    color Quartile.25 Quartile.75    IQR
##   (fctr)       (dbl)       (dbl)  (dbl)
## 1      J      1860.5        7695 5834.5

Q3.12

# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.

ggplot(diamonds, aes(x = color, y = price/carat, fill = color)) +
  geom_boxplot() +
  coord_cartesian(ylim=c(1000, 6000)) +
  scale_y_continuous(labels=dollar) + 
  xlab("Color") + ylab("Price per Carat")

Q3.13

# Investigate the weight of the diamonds (carat) using a frequency polygon. Use different bin widths to see how the frequency polygon changes. What carat size has a count greater than 2000? Check all that apply.
# 0.3 and 1.01 in green
sizes = c(0.1, 0.3, 0.8, 1.01, 1.6, 2.0, 3.0, 5.0)

summary(diamonds$carat)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7979  1.0400  5.0100

ggplot(diamonds, aes(x=carat)) + 
  geom_freqpoly(binwidth=0.1, alpha = 0.75) + 
  scale_x_continuous(breaks=sizes, expand = c(0,0)) +
  scale_y_continuous(expand=c(0,0))+
  geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "red", linetype="dashed", alpha = 0.75) +
  geom_vline(xintercept = c(0.3, 1.01), color = "forestgreen", linetype = "twodash") +
  geom_hline(yintercept = 2000, color = "brown", linetype="longdash", alpha = 0.5) +
  xlab("Carat Size") + ylab("Count")

#Q3.14

# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.

# http://spreadsheets.google.com/pub?key=rdCufG2vozTpKw7TBGbyoWw&output=xls
hours <- tbl_df(read.xlsx("indicator_hours per week.xlsx", sheetName="Data", header=TRUE))
hours <- hours %>%
  select(-NA.) %>% # Remove NA column carried over from xlsx
  rename(Country = Working.hours.per.week) %>%
  filter(Country != "<NA>") # Remove <NA> row carried over from xlsx

hours.long <-  hours %>% gather("Year", "Hours",2:29)

hours.long <- hours.long %>%
  mutate(Year = as.character(Year), # Convert to character
         Year = substr(Year, 2, 5), # Slice out the X, leaving last 4 digits; R added X since initially since column names can't start with numbers.
         Year = as.numeric(Year))   # Cast as numeric

yearStats <- hours.long %>%
  group_by(Year) %>%
  summarise(median = median(Hours, na.rm=TRUE),
            mean = mean(Hours, na.rm=TRUE),
            lower = min(Hours, na.rm=TRUE),
            upper = max(Hours, na.rm=TRUE),
            se = sd(Hours, na.rm=TRUE)/sqrt(length(Hours)),
            avg_upper = mean + (2.101*se),
            avg_lower = mean - (2.101*se),
            quant.25 = quantile(Hours, na.rm=TRUE, 0.25),
            quant.75 = quantile(Hours, na.rm=TRUE, 0.75))

yearStats <- round(yearStats, 2)

p <- ggplot(yearStats, aes(x=Year, y=median)) + 
  theme(plot.background = element_blank(),
                panel.grid.minor = element_blank(),
                panel.grid.major = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.ticks = element_blank(),
                axis.title = element_blank()) +
    geom_linerange(yearStats, mapping=aes(x=Year, ymin=lower, ymax=upper), colour = "wheat2", alpha=1, size=5) + 
  geom_linerange(yearStats, mapping=aes(x=Year, ymin=quant.25, ymax=quant.75), colour = "wheat4", size=5) +
  geom_line(yearStats, mapping=aes(x=Year, y=median, group=1)) +
  geom_vline(xintercept = 1980, colour = "wheat4", linetype=1, size=1) + 
  geom_hline(yintercept=seq(26, 56, 2), color="white", linetype=1) 

dottedYears <- seq(1980, 2007, 5) # Pick the years to draw dotted vertical lines on

p <- p + geom_vline(xintercept = dottedYears, color="wheat4", linetype=3, size=0.5)

p <- p + coord_cartesian(ylim = c(26,58))+
  scale_y_continuous(breaks=seq(26, 60, 2)) +
  scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) )

p <- p + geom_line(data = subset(hours.long, Country == "Egypt"), aes(x = Year, y = Hours, group = Country), color ="brown") +
  annotate("segment", x=2000, xend=2002, y=35.5, yend=36, color="brown") +
  annotate("text", x=2003, y=36.3, label="Egypt Hours", size=3.5, color="brown") + 
  annotate("segment", x=2000, xend=2001, y=33.5, yend=32) + 
  annotate("text", x=2002, y=31.7, label="World Medians", size=3.5)


p1 <- p + annotate("text", x=1999.9, y=56, label="Data represents hours worked per week for 52 countries   ", size=3, color="gray30") + 
    annotate("text", x=2000, y=55, label="from 1980 to 2007. Outer lighter bands show the min/max  ", size=3, color="gray30") +
    annotate("text", x=2000, y=54, label="hours for each year, and inner darker bands show the IQR.", size=3, color="gray30") + 
    ggtitle("World's Working Hours") +
    theme(plot.title=element_text(face="bold",hjust=.95,vjust=.8,color="#3C3C3C",size=20)) + 
    annotate("text", x=1994.6, y=57.5, label="Weekly", size=4, fontface="bold")

p1

Q3.15

# How many birthdays are in each month?

# Which day of the year has the most number of birthdays?

# Do you have at least 365 friends that have birthdays on everyday
# of the year?

# **********************************************************************

# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.

# You can expect to spend 30 min or more on this task depending on if
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.

# You'll need to export your Facebooks friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friendsâ birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.

# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.


birthdays <- tbl_df(read.csv("birthdaysExample.csv"))

tempDates <- mdy(birthdays$dates)

birthdays <- birthdays %>%
  mutate(Birthday = tempDates,
         Year = year(tempDates),
         Month = month(tempDates, label=TRUE, abbr=FALSE),
         Day = day(tempDates),
         Weekday = weekdays(tempDates, abbr=FALSE))


birthdays$Weekday <- factor(birthdays$Weekday, levels=c('Monday', 'Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'), ordered=TRUE)

 # ifelse didn't seem to work here for creating 'optional' since the else would create a NULL vector within the function and R complained.
 # creates optional ggplot argument only if Day is passed in.
 # aes_string takes in a string as the column name for plotting. Perfect for our arguments we pass in as strings.
 # Paste our argument in for custom X and Y-axis labeling

makePlots<- function(TimeLength){

  optional<- NULL
  
  if(TimeLength == "Day")  optional<- scale_x_discrete(breaks = seq(1, 31, 1)) 
  if(TimeLength == "Month")  optional<- scale_x_discrete(breaks = seq(1, 12, 1)) 
  if(TimeLength == "Month")  optional<- scale_x_discrete(breaks = seq(1, 12, 1)) 

  ggplot(birthdays, aes_string(x = TimeLength, fill = TimeLength)) + 
    geom_bar() + 
    scale_fill_brewer(palette="Paired") + 
    xlab(TimeLength)+
    ylab(paste0("Number of Birthdays per ", TimeLength)) + 
    theme_tufte() + 
    optional

}
    
makePlots("Month")

makePlots("Day")

makePlots("Weekday")

EDA Project: Diamond Prices (Part 1)

/mohammed

February 26, 2016

Q3.1

Q3.2

Q3.4

Q3.5

Q3.6

Q3.7

Q3.8

Q3.9

Q3.11

Q3.12

Q3.13

Q3.15