The problem set is worth 100 points.
Enter your answers in the empty code chunks. Replace “# your code here” with your code.
Make sure you run this chunk before attempting any of the problems:
library(ggplot2)
library(tidyverse)
# Arithmetic warm-up: addition, multiplication, and a nested expression.
2 + 2
## [1] 4
2 * 3
## [1] 6
((2 + 2) * (3^2 + 5)) / (6 / 4)
## [1] 37.33333
dplyr
Let’s work with the data set diamonds:
data(diamonds) # this will load a dataset called "diamonds"
# Kernel density estimate of price. Titled "Density Plot" because a
# "probability plot" names a different graphic (P-P / Q-Q style comparison).
ggplot(diamonds, aes(x = price)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(
    title = "Density Plot of Diamond Prices",
    x = "Price (USD)",
    y = "Density"
  ) +
  theme_minimal()
%>% and
summarise() syntax (hint: see lectures).
# Overall mean price. The column is referenced by its bare name so that
# summarise() reads it from the piped data rather than from the global
# `diamonds` object (which would ignore any upstream filter or grouping).
Mean <- diamonds %>%
  summarise(AvgDiamondP = mean(price, na.rm = TRUE))
print(Mean)
## # A tibble: 1 × 1
## AvgDiamondP
## <dbl>
## 1 3933.
%>% and summarise() syntax.
# Mean, median, and standard deviation of price in a single summary row.
# sd() is the built-in equivalent of sqrt(var()).
Stats <- diamonds %>%
  summarise(
    AvgDiamondP = mean(price, na.rm = TRUE),
    MedDiamondP = median(price, na.rm = TRUE),
    StdDiamondP = sd(price, na.rm = TRUE)
  )
print(Stats)
## # A tibble: 1 × 3
## AvgDiamondP MedDiamondP StdDiamondP
## <dbl> <dbl> <dbl>
## 1 3933. 2401 3989.
Use group_by() to group diamonds by color,
then use summarise() to calculate the average price
and the standard deviation in price by
color:
# List the colour grades present in the data.
levels(diamonds$color)
## [1] "D" "E" "F" "G" "H" "I" "J"
# Mean and standard deviation of price within each colour grade.
# sd() is the built-in equivalent of sqrt(var()).
ColorsStats <- diamonds %>%
  group_by(color) %>%
  summarise(
    AvgDiamondP = mean(price, na.rm = TRUE),
    StdDiamondP = sd(price, na.rm = TRUE)
  )
print(ColorsStats)
## # A tibble: 7 × 3
## color AvgDiamondP StdDiamondP
## <ord> <dbl> <dbl>
## 1 D 3170. 3357.
## 2 E 3077. 3344.
## 3 F 3725. 3785.
## 4 G 3999. 4051.
## 5 H 4487. 4216.
## 6 I 5092. 4722.
## 7 J 5324. 4438.
Use filter() to remove observations with a depth greater than
62, then use group_by() to group diamonds by
clarity, then use summarise() to find the
maximum price of a diamond by clarity:
# Keep diamonds with depth below 62. The column is referenced by its bare
# name so filter() evaluates it against the piped data instead of the global
# `diamonds` object.
# NOTE(review): the task asks to drop depth *greater than* 62, which would
# keep depth == 62 (`depth <= 62`). `<` is left in place so the printed
# results below still match; confirm the intended boundary and re-knit if it
# changes.
Section <- diamonds %>%
  filter(depth < 62)
# Row counts before and after filtering.
nrow(diamonds)
## [1] 53940
nrow(Section)
## [1] 29271
# List the clarity grades of the filtered data.
clarity <- levels(Section$clarity)
clarity
## [1] "I1" "SI2" "SI1" "VS2" "VS1" "VVS2" "VVS1" "IF"
# Highest price within each clarity grade of the filtered data.
MaxPrice <- Section %>%
  group_by(clarity) %>%
  summarise(
    MaxDiamondP = max(price, na.rm = TRUE)
  )
print(MaxPrice)
## # A tibble: 8 × 2
## clarity MaxDiamondP
## <ord> <int>
## 1 I1 15223
## 2 SI2 18784
## 3 SI1 18797
## 4 VS2 18823
## 5 VS1 18795
## 6 VVS2 18730
## 7 VVS1 18682
## 8 IF 18806
Use mutate() and log() to add a new variable called “log_price” to
the data. Make sure you add the variable to the
dataset diamonds.
# Add the natural log of price as a new column. The column is referenced by
# its bare name so mutate() reads it from the piped data rather than from the
# global `Section` object.
# NOTE(review): the task asks for a column named "log_price" on `diamonds`;
# this adds `LogPrice` to `Section`, which the later plots and models use.
Section <- Section %>%
  mutate(LogPrice = log(price))
head(Section)
## # A tibble: 6 × 11
## carat cut color clarity depth table price x y z LogPrice
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 5.79
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 5.79
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 5.79
## 4 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 5.82
## 5 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 5.82
## 6 0.22 Premium F SI1 60.4 61 342 3.88 3.84 2.33 5.83
(Hint: if I wanted to add a variable called “max_price” that calculates the max price, the code would look like this:)
# Hint example: attach the overall maximum price to every row.
diamonds <- diamonds %>%
  mutate(max_price = max(price))
ggplot2
Continue using diamonds.
geom_histogram() to plot a histogram of prices:
# Histogram of price using bins of width 500.
ggplot(data = Section, mapping = aes(x = price)) +
  geom_histogram(
    binwidth = 500,
    fill = "blue",
    color = "black",
    alpha = 0.5
  ) +
  labs(
    title = "Histogram of Diamond Prices",
    x = "Price (USD)",
    y = "Frequency"
  ) +
  theme_minimal()
Use geom_density() to plot the density of log prices
(the variable you added to the data frame):
# Kernel density estimate of log price. Titled "Density Plot" because a
# "probability plot" names a different graphic (P-P / Q-Q style comparison).
ggplot(Section, aes(x = LogPrice)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(
    title = "Density Plot of the Natural Log of Diamond Prices",
    x = "log(Price (USD))",
    y = "Density"
  ) +
  theme_minimal()
Use geom_point() to plot carats against log prices (i.e. carats
on the x-axis, log prices on the y-axis):
# Scatter plot with carat on the x-axis and log price on the y-axis.
ggplot(data = Section, mapping = aes(x = carat, y = LogPrice)) +
  geom_point(color = "blue", alpha = 0.5) +
  labs(
    title = "Scatter Plot of Carats vs Log Prices",
    x = "Carats",
    y = "Log of Price (USD)"
  ) +
  theme_minimal()
geom_smooth():
# Same scatter plot, overlaid with a linear fit and its confidence band.
ggplot(data = Section, mapping = aes(x = carat, y = LogPrice)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Scatter Plot of Carats vs Log Prices",
    x = "Carats",
    y = "Log of Price (USD)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Use stat_summary() to make a bar plot of average log
price by cut:
# One bar per cut; bar height is the mean of LogPrice for that cut.
ggplot(data = Section, mapping = aes(x = cut, y = LogPrice)) +
  stat_summary(
    fun = mean,
    geom = "bar",
    fill = "skyblue",
    color = "black"
  ) +
  labs(
    title = "Bar Plot of Average Log Price by Cut",
    x = "Cut",
    y = "Average Log Price (USD)"
  ) +
  theme_minimal()
theme_classic():
# Same bar plot of mean LogPrice by cut, drawn with the classic theme.
ggplot(data = Section, mapping = aes(x = cut, y = LogPrice)) +
  stat_summary(
    fun = mean,
    geom = "bar",
    fill = "skyblue",
    color = "black"
  ) +
  labs(
    title = "Bar Plot of Average Log Price by Cut",
    x = "Cut",
    y = "Average Log Price (USD)"
  ) +
  theme_classic()
Use lm() to estimate the model and store the output in an object called
“m1”:
\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \varepsilon \]
# Linear model of log price on carat and table, fitted to the filtered data.
m1 <- lm(LogPrice ~ carat + table, data = Section)
summary() to view the output of “m1”:summary(m1)
##
## Call:
## lm(formula = LogPrice ~ carat + table, data = Section)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6148 -0.2378 0.0291 0.2492 1.5096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0594532 0.0563898 125.19 <2e-16 ***
## carat 2.0194171 0.0049120 411.12 <2e-16 ***
## table -0.0149499 0.0009848 -15.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3845 on 29268 degrees of freedom
## Multiple R-squared: 0.8554, Adjusted R-squared: 0.8554
## F-statistic: 8.656e+04 on 2 and 29268 DF, p-value: < 2.2e-16
Use lm() to estimate the model and store the output in an object called
“m2”:
\[ \log(\text{price}) = \beta_0 + \beta_1 \text{carat} + \beta_2 \text{table} + \beta_3 \text{depth} + \varepsilon \]
# Linear model of log price on carat, table, and depth, fitted to the
# filtered data.
m2 <- lm(LogPrice ~ carat + table + depth, data = Section)
summary() to view the output of “m2”:summary(m2)
##
## Call:
## lm(formula = LogPrice ~ carat + table + depth, data = Section)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6113 -0.2372 0.0293 0.2492 1.5132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.284706 0.187696 38.811 <2e-16 ***
## carat 2.019165 0.004916 410.732 <2e-16 ***
## table -0.015624 0.001121 -13.937 <2e-16 ***
## depth -0.003060 0.002432 -1.258 0.208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3845 on 29267 degrees of freedom
## Multiple R-squared: 0.8554, Adjusted R-squared: 0.8554
## F-statistic: 5.771e+04 on 3 and 29267 DF, p-value: < 2.2e-16