1 Directions

The problem set is worth 100 points.

Enter your answers in the empty code chunks. Replace “# your code here” with your code.

Make sure you run this chunk before attempting any of the problems:

library(tidyverse)

2 Basics

Calculate \(2+2\):

# Sum of 2 and 2.
2 + 2
## [1] 4

Calculate \(2*3\):

# Product of 2 and 3.
2 * 3
## [1] 6

Calculate \(\frac{(2+2)\times (3^2 + 5)}{(6/4)}\):

# Numerator (2 + 2) * (3^2 + 5) divided by the denominator 6 / 4.
((2 + 2) * (3^2 + 5)) / (6 / 4)
## [1] 37.33333

3 dplyr

Let’s work with the data set diamonds:

library(ggplot2)
library(dplyr)

# Load the "diamonds" dataset that ships with ggplot2 into the workspace.
data("diamonds", package = "ggplot2")

Calculate the average price of a diamond. Use the %>% and summarise() syntax (hint: see lectures).

# Mean price over all diamonds; summarise() returns a one-row tibble.
average_price <- diamonds %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Pull the single number out of the one-row summary.
price <- pull(average_price, avg_price)

print(paste("Average price:", price))
## [1] "Average price: 3932.79972191324"

Calculate the average, median, and standard deviation of the price of a diamond. Use the %>% and summarise() syntax.

# Mean price: one-row summary, then the bare number.
average_price <- diamonds %>%
  summarise(avg_price = mean(price, na.rm = TRUE))
avg_price <- average_price[["avg_price"]]

# Median price.
Med <- diamonds %>%
  summarise(med_price = median(price, na.rm = TRUE))
Median <- Med[["med_price"]]

# Standard deviation of price.
std <- diamonds %>%
  summarise(std_price = sd(price, na.rm = TRUE))
Std <- std[["std_price"]]

print(paste("Average price:", avg_price))
## [1] "Average price: 3932.79972191324"
print(paste("Median price:", Median))
## [1] "Median price: 2401"
print(paste("Standard Deviation:", Std))
## [1] "Standard Deviation: 3989.43973814638"

Use group_by() to group diamonds by color, then use summarise() to calculate the average price and the standard deviation in price by color:

# Preview the data grouped by color; grouping adds group metadata and leaves
# the rows themselves untouched.
diamonds %>%
  group_by(color)
## # A tibble: 53,940 × 10
## # Groups:   color [7]
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ℹ 53,930 more rows
# Mean, median and standard deviation of price within each color grade.
summary_by_color <- summarise(
  group_by(diamonds, color),
  avg = mean(price, na.rm = TRUE),
  med = median(price, na.rm = TRUE),
  std = sd(price, na.rm = TRUE)
)

print(summary_by_color)
## # A tibble: 7 × 4
##   color   avg   med   std
##   <ord> <dbl> <dbl> <dbl>
## 1 D     3170. 1838  3357.
## 2 E     3077. 1739  3344.
## 3 F     3725. 2344. 3785.
## 4 G     3999. 2242  4051.
## 5 H     4487. 3460  4216.
## 6 I     5092. 3730  4722.
## 7 J     5324. 4234  4438.

Use filter() to remove observations with a depth greater than 62, then use group_by() to group diamonds by clarity, then use summarise() to find the maximum price of a diamond by clarity:

# Drop diamonds with depth greater than 62, then group the rest by clarity.
# The filter must use the `depth` column; `z` is a different column of
# `diamonds`, and all the `z` values shown in the preview above are far below
# 62, so filtering on it does not apply the intended cut-off.
# Columns are referenced by bare name inside dplyr verbs, which also keeps the
# grouping column named `clarity` in the result.
filtered_diamonds <- diamonds %>%
  filter(depth <= 62) %>%
  group_by(clarity)

# Highest price within each clarity grade (na.rm = TRUE, as in the other
# summaries in this document).
max_price_clarity <- filtered_diamonds %>%
  summarise(max_price = max(price, na.rm = TRUE))

# NOTE: re-knit after this change. The table printed below was produced by the
# earlier version (filter on `z`, grouping column `filtered_diamonds$clarity`).
print(max_price_clarity)
## # A tibble: 8 × 2
##   `filtered_diamonds$clarity` max_price
##   <ord>                           <int>
## 1 I1                              18531
## 2 SI2                             18804
## 3 SI1                             18818
## 4 VS2                             18823
## 5 VS1                             18795
## 6 VVS2                            18768
## 7 VVS1                            18777
## 8 IF                              18806

Use mutate() and log() to add a new variable called “log_price” to the data. Make sure you add the variable to the dataset diamonds.

# Add the natural log of price as a new column and store the result back in
# `diamonds`. Inside mutate() columns are referenced by bare name, so the
# `diamonds$` prefix is unnecessary (and would ignore any grouping).
diamonds <- diamonds %>%
  mutate(log_price = log(price))

# Confirm that the new column is now part of the data frame.
column_check <- "log_price" %in% names(diamonds)

print(paste("log_price status: ", column_check))
## [1] "log_price status:  TRUE"

(Hint: if I wanted to add a variable called “max_price” that calculates the max price, the code would look like this:)

# Example: add a column holding the overall maximum price to every row.
diamonds <- diamonds %>%
  mutate(max_price = max(price))

4 ggplot2

Continue using diamonds.

Use geom_histogram() to plot a histogram of prices:

# Histogram of price with bins 500 wide.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 500, fill = "blue", colour = "black") +
  labs(
    title = "Histogram of Diamond Prices",
    x = "Price",
    y = "Count"
  )

Use geom_density() to plot the density of log prices (the variable you added to the data frame):

# Density of log_price. geom_density() has no `binwidth` parameter (it was
# ignored with a warning), so the argument is dropped and the default
# smoothing is used.
ggplot(diamonds, aes(x = log_price)) +
  geom_density(fill = "blue", color = "black") +
  labs(title = "Density of log_price", x = "log_price", y = "Density")

Use geom_point() to plot carats against log prices (i.e. carats on the x-axis, log prices on the y-axis):

# Scatter plot with carat on the x-axis and log_price on the y-axis.
# geom_point() has no `binwidth` parameter (it was ignored with a warning), so
# it is removed; the points are drawn in plain black, matching the
# regression-line version of this plot.
ggplot(diamonds, aes(x = carat, y = log_price)) +
  geom_point(color = "black") +
  labs(title = "Carats against log_price", x = "Carat", y = "log_price")

Same as above, but now add a regression line with geom_smooth():

# Scatter of log_price against carat with a fitted straight line
# (method = "lm") and no confidence band (se = FALSE).
diamonds %>%
  ggplot(aes(x = carat, y = log_price)) +
  geom_point(colour = "black") +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  labs(
    title = "Carats against log_price with Regression Line",
    x = "Carat",
    y = "log_price"
  )
## `geom_smooth()` using formula = 'y ~ x'

Use stat_summary() to make a bar plot of average log price by cut:

# Bar chart: each bar's height is the mean log_price within that cut.
diamonds %>%
  ggplot(aes(x = cut, y = log_price)) +
  stat_summary(fun = mean, geom = "bar", fill = "blue", colour = "black") +
  labs(
    title = "Average log_price by Cut",
    x = "Cut",
    y = "Average log_price"
  )

Same as above but change the theme to theme_classic():

# Same bar chart of mean log_price by cut, drawn with the classic theme.
diamonds %>%
  ggplot(aes(x = cut, y = log_price)) +
  stat_summary(fun = mean, geom = "bar", fill = "blue", colour = "black") +
  labs(
    title = "Average log_price by Cut",
    x = "Cut",
    y = "Average log_price"
  ) +
  theme_classic()

5 Inference

Use lm() to estimate the model

\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \varepsilon \]

and store the output in an object called “m1”:

# Linear model of log_price on carat and table, stored as `m1`.
# NOTE(review): the `diamonds$` prefixes are echoed in the model call and in
# the coefficient labels that summary() prints. Writing
# `lm(log_price ~ carat + table, data = diamonds)` would give the same
# estimates with cleaner labels; left unchanged here so the printed summary
# below stays in sync with the code.
m1 <- lm(diamonds$log_price ~ diamonds$carat + diamonds$table)

Use summary() to view the output of “m1”:

# Print the coefficient table and fit statistics for m1.
m1 %>%
  summary()
## 
## Call:
## lm(formula = diamonds$log_price ~ diamonds$carat + diamonds$table)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2930 -0.2453  0.0338  0.2571  1.5573 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.4527654  0.0443008 145.658  < 2e-16 ***
## diamonds$carat  1.9733423  0.0036678 538.015  < 2e-16 ***
## diamonds$table -0.0041876  0.0007781  -5.382  7.4e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3971 on 53937 degrees of freedom
## Multiple R-squared:  0.8469, Adjusted R-squared:  0.8469 
## F-statistic: 1.491e+05 on 2 and 53937 DF,  p-value: < 2.2e-16

Use lm() to estimate the model

\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \beta_3 \text{depth} + \varepsilon \]

and store the output in an object called “m2”:

# Linear model of log_price on carat, table and depth, stored as `m2`.
# The third regressor is the `depth` column, as in the model stated above;
# `z` is a different column of `diamonds` and is not part of this model.
# NOTE: re-knit after this change. The summary(m2) output printed below was
# produced with `diamonds$z` and no longer matches this model.
m2 <- lm(diamonds$log_price ~ diamonds$carat + diamonds$table + diamonds$depth)

Use summary() to view the output of “m2”:

# Print the coefficient table and fit statistics for m2.
# NOTE(review): the table printed below lists `diamonds$z` as the third
# regressor, while the model statement asks for depth. Confirm the definition
# of m2 and re-knit.
m2 %>%
  summary()
## 
## Call:
## lm(formula = diamonds$log_price ~ diamonds$carat + diamonds$table + 
##     diamonds$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.8968  -0.1912   0.0100   0.1933   4.8441 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.8705421  0.0433889  89.206  < 2e-16 ***
## diamonds$carat 0.6671196  0.0105041  63.510  < 2e-16 ***
## diamonds$table 0.0024626  0.0006802   3.621 0.000294 ***
## diamonds$z     0.9162626  0.0070186 130.548  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3461 on 53936 degrees of freedom
## Multiple R-squared:  0.8836, Adjusted R-squared:  0.8836 
## F-statistic: 1.365e+05 on 3 and 53936 DF,  p-value: < 2.2e-16