The problem set is worth 100 points.
Enter your answers in the empty code chunks. Replace “# your code here” with your code.
Make sure you run this chunk before attempting any of the problems:
library(tidyverse)
Calculate \(2+2\):
# Sum of 2 and 2.
2 + 2
## [1] 4
Calculate \(2*3\):
# Product of 2 and 3.
2 * 3
## [1] 6
Calculate \(\frac{(2+2)\times (3^2 + 5)}{(6/4)}\):
# Numerator (2 + 2) * (3^2 + 5) divided by the denominator 6 / 4.
((2 + 2) * (3^2 + 5)) / (6 / 4)
## [1] 37.33333
dplyr
Let’s work with the data set `diamonds`:
# Attach the plotting and data-manipulation packages used in the sections below.
library(ggplot2)
# dplyr supplies %>%, summarise(), group_by(), filter() and mutate().
library(dplyr)
data(diamonds) # this will load a dataset called "diamonds"
Calculate the average price of a diamond. Use the %>%
and summarise()
syntax (hint: see lectures).
# Mean diamond price, computed with the pipe + summarise() idiom.
average_price <- diamonds %>%
  summarise(avg_price = mean(price, na.rm = TRUE))

# Pull the single value out of the one-row summary so it can be printed.
price <- average_price[["avg_price"]]
print(paste("Average price:", price))
## [1] "Average price: 3932.79972191324"
Calculate the average, median, and standard deviation of the price of a
diamond. Use the `%>%` and `summarise()` syntax.
# Average ----
average_price <- diamonds %>%
  summarise(avg_price = mean(price, na.rm = TRUE))
avg_price <- average_price[["avg_price"]]

# Median ----
Med <- diamonds %>%
  summarise(med_price = median(price, na.rm = TRUE))
Median <- Med[["med_price"]]

# Standard deviation ----
std <- diamonds %>%
  summarise(std_price = sd(price, na.rm = TRUE))
Std <- std[["std_price"]]

# Report each statistic on its own line.
print(paste("Average price:", avg_price))
## [1] "Average price: 3932.79972191324"
print(paste("Median price:", Median))
## [1] "Median price: 2401"
print(paste("Standard Deviation:", Std))
## [1] "Standard Deviation: 3989.43973814638"
Use group_by()
to group diamonds by
color, then use summarise()
to calculate
the average price and the standard deviation in price
by color:
# Preview the data grouped by color (auto-prints the grouped tibble).
diamonds %>%
  group_by(color)
## # A tibble: 53,940 × 10
## # Groups: color [7]
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ℹ 53,930 more rows
# Per-color price summary: one row per color with the mean, median and
# standard deviation of price.
summary_by_color <- diamonds %>%
  group_by(color) %>%
  summarise(
    avg = mean(price, na.rm = TRUE),
    med = median(price, na.rm = TRUE),
    std = sd(price, na.rm = TRUE)
  )

print(summary_by_color)
## # A tibble: 7 × 4
## color avg med std
## <ord> <dbl> <dbl> <dbl>
## 1 D 3170. 1838 3357.
## 2 E 3077. 1739 3344.
## 3 F 3725. 2344. 3785.
## 4 G 3999. 2242 4051.
## 5 H 4487. 3460 4216.
## 6 I 5092. 3730 4722.
## 7 J 5324. 4234 4438.
Use `filter()` to remove observations with a depth greater than 62, then
use `group_by()` to group diamonds by clarity, then use `summarise()` to
find the maximum price of a diamond by clarity:
# Drop observations with depth greater than 62, then group by clarity.
# The filter must use the `depth` column: `z` is a different column whose
# values in this data are only around 2-4, so `z <= 62` filters out
# essentially nothing.
# Grouping by the bare column name keeps the output column named `clarity`
# rather than `filtered_diamonds$clarity`.
filtered_diamonds <- diamonds %>%
  filter(depth <= 62) %>%
  group_by(clarity)

# Maximum price within each clarity grade.
max_price_clarity <- filtered_diamonds %>%
  summarise(max_price = max(price))

# NOTE(review): re-knit so the printed table reflects the depth filter.
print(max_price_clarity)
## # A tibble: 8 × 2
## `filtered_diamonds$clarity` max_price
## <ord> <int>
## 1 I1 18531
## 2 SI2 18804
## 3 SI1 18818
## 4 VS2 18823
## 5 VS1 18795
## 6 VVS2 18768
## 7 VVS1 18777
## 8 IF 18806
Use `mutate()` and `log()` to add a new variable called “log_price” to the
data. Make sure you add the variable to the dataset `diamonds`.
##Use "diamonds<-" to store the newly mutated data frame
# Overwrite `diamonds` with a copy that carries the natural log of price.
diamonds <- diamonds %>%
  mutate(log_price = log(price))

# Confirm the new column is present among the column names.
column_check <- "log_price" %in% names(diamonds)
print(paste("log_price status: ", column_check))
## [1] "log_price status: TRUE"
(Hint: if I wanted to add a variable called “max_price” that calculates the max price, the code would look like this:)
# Hint example: add a column holding the overall maximum price.
diamonds <- diamonds %>%
  mutate(max_price = max(price))
ggplot2
Continue using diamonds
.
Use geom_histogram()
to plot a histogram of prices:
# Histogram of price using bins 500 wide.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 500, fill = "blue", color = "black") +
  labs(
    title = "Histogram of Diamond Prices",
    x = "Price",
    y = "Count"
  )
Use geom_density()
to plot the density of log
prices (the variable you added to the data frame):
# Kernel density of log_price.
# `binwidth` is a histogram argument; geom_density() ignored it with a
# warning, so it is removed here.
ggplot(diamonds, aes(x = log_price)) +
  geom_density(fill = "blue", color = "black") +
  labs(title = "Density of log_price", x = "log_price", y = "Density")
Use geom_point()
to plot carats against log prices
(i.e. carats on the x-axis, log prices on the y-axis):
# Scatter plot with carat on the x-axis and log_price on the y-axis.
# `binwidth` is not a geom_point() argument (it produced a warning), and
# `fill` does nothing for the default point shape, so both are dropped.
# The x-axis label typo ("Crats") is corrected.
ggplot(diamonds, aes(x = carat, y = log_price)) +
  geom_point(color = "black") +
  labs(title = "Carats against log_price", x = "Carat", y = "log_price")
Same as above, but now add a regression line with
geom_smooth()
:
# Scatter plot of carat against log_price with a fitted straight line
# (method = "lm") drawn in red and no confidence band.
ggplot(data = diamonds, mapping = aes(x = carat, y = log_price)) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Carats against log_price with Regression Line",
    x = "Carat",
    y = "log_price"
  )
## `geom_smooth()` using formula = 'y ~ x'
Use stat_summary()
to make a bar plot of average
log price by cut:
# Bar chart: stat_summary() reduces log_price to its mean within each cut
# and draws one bar per cut.
ggplot(data = diamonds, mapping = aes(x = cut, y = log_price)) +
  stat_summary(fun = mean, geom = "bar", fill = "blue", color = "black") +
  labs(
    title = "Average log_price by Cut",
    x = "Cut",
    y = "Average log_price"
  )
Same as above but change the theme to
theme_classic()
:
# Mean log_price per cut as a bar chart, rendered with theme_classic().
ggplot(data = diamonds, mapping = aes(x = cut, y = log_price)) +
  stat_summary(fun = mean, geom = "bar", fill = "blue", color = "black") +
  labs(
    title = "Average log_price by Cut",
    x = "Cut",
    y = "Average log_price"
  ) +
  theme_classic()
Use lm()
to estimate the model
\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \varepsilon \]
and store the output in an object called “m1”:
# Model 1: least-squares fit of log_price on carat and table.
m1 <- lm(formula = diamonds$log_price ~ diamonds$carat + diamonds$table)
Use summary()
to view the output of “m1”:
# Print the coefficient table and fit statistics for m1.
summary(m1)
##
## Call:
## lm(formula = diamonds$log_price ~ diamonds$carat + diamonds$table)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2930 -0.2453 0.0338 0.2571 1.5573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4527654 0.0443008 145.658 < 2e-16 ***
## diamonds$carat 1.9733423 0.0036678 538.015 < 2e-16 ***
## diamonds$table -0.0041876 0.0007781 -5.382 7.4e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3971 on 53937 degrees of freedom
## Multiple R-squared: 0.8469, Adjusted R-squared: 0.8469
## F-statistic: 1.491e+05 on 2 and 53937 DF, p-value: < 2.2e-16
Use lm()
to estimate the model
\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \beta_3 \text{depth} + \varepsilon \]
and store the output in an object called “m2”:
# Model 2: least-squares fit of log_price on carat, table and depth.
# The specification asks for the `depth` column; the previous version
# regressed on `z`, which is a different column of the data.
# NOTE(review): re-knit so the printed summary(m2) reflects this model.
m2 <- lm(diamonds$log_price ~ diamonds$carat + diamonds$table + diamonds$depth)
Use summary()
to view the output of “m2”:
# Print the coefficient table and fit statistics for m2.
summary(m2)
##
## Call:
## lm(formula = diamonds$log_price ~ diamonds$carat + diamonds$table +
## diamonds$z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.8968 -0.1912 0.0100 0.1933 4.8441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8705421 0.0433889 89.206 < 2e-16 ***
## diamonds$carat 0.6671196 0.0105041 63.510 < 2e-16 ***
## diamonds$table 0.0024626 0.0006802 3.621 0.000294 ***
## diamonds$z 0.9162626 0.0070186 130.548 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3461 on 53936 degrees of freedom
## Multiple R-squared: 0.8836, Adjusted R-squared: 0.8836
## F-statistic: 1.365e+05 on 3 and 53936 DF, p-value: < 2.2e-16