BA Group 10 Codes

Quarto

Hümay Tuğçe Okunakol, Inesa Habelaia, Senna Periviita and Brendan Friberg

Packages

#warning: false 
#echo: false
library(ggplot2)
library(rio)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(hexbin)
library(scales)


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

library(ggridges)
library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(dplyr)
library(broom)
library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(caret)

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library(ggcorrplot)
library(lmtest)

Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

library(sandwich)
diamonds <- diamonds 
view(diamonds)
summary(diamonds)

     carat               cut        color        clarity          depth      
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
 Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
                                    J: 2808   (Other): 2531                  
     table           price             x                y         
 Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
 1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
 Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
 Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
 3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
 Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
                                                                  
       z         
 Min.   : 0.000  
 1st Qu.: 2.910  
 Median : 3.530  
 Mean   : 3.539  
 3rd Qu.: 4.040  
 Max.   :31.800

Boxplots

ggplot(diamonds, aes(x = cut, y = price, fill = cut)) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Price ~ Cut",
    x = "Cut (from worst to best quality)",
    y = "Price"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

ggplot(diamonds, aes(x = color, y = price, fill = color)) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Diamond Price Distribution by Color",
       x = "Color Grade (D = Clear, J = Intense Color)",
       y = "Price (USD)") +
  theme_minimal()

ggplot(diamonds, aes(x = clarity, y = price, fill = clarity)) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Price & clarity",
    x = "clarity",
    y = "Price"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

Price vs. Carat(Scatter Plot)

ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "gam", formula = y ~ s(x), se = FALSE, color = "red") +
  labs(title = "Price - Carat",
       x = "Carat",
       y = "Price")

Price vs. Carat(Heatmap)

#This heat map visualizes the relationship between diamond carat and price where color intensity represents the number of diamonds in each area.
 ggplot(diamonds, aes(carat, price)) +
    geom_hex(bins = 40) +
     scale_y_log10(labels=scales::comma) +
     labs(title="Price ~ Carat (hexbin, log10 price)") +
guides(fill=guide_colorbar(title="Count"))

Distribution of Numerical Variables

#Purpose: See skewness, outliers, and typical ranges.
diamonds %>%
    select(price, carat, depth, table, x, y, z) %>%
    pivot_longer(everything()) %>%
    ggplot(aes(value)) +
    geom_histogram(bins = 40) +
    facet_wrap(~name, scales = "free") +
    labs(title="Distributions of Numerical Variables")

Cleaning Data

# Remove potential outliers (x,y,z = 0 are invalid)
diamonds_clean <- diamonds %>%
  filter(x > 0, y > 0, z > 0)

# Log-transform price and carat to handle right-skewed distributions
diamonds_clean <- diamonds_clean %>%
  mutate(log_price = log(price),
         log_carat = log(carat))

Summary of Data

numeric_summary <- diamonds %>%
  select(carat, depth, table, x, y, z, price) %>%
  summarise(across(
    everything(),
    list(
      Mean = ~mean(.),
      Median = ~median(.),
      SD = ~sd(.),
      Min = ~min(.),
      Max = ~max(.)
    ),
    .names = "{.col}_{.fn}"
  )) %>%
  tidyr::pivot_longer(everything(),
                      names_to = c("Variable", ".value"),
                      names_sep = "_")
numeric_summary <- numeric_summary %>%
  mutate(across(c(Mean, Median, SD, Min, Max), ~round(., 2)))

numeric_summary

# A tibble: 7 × 6
  Variable    Mean  Median      SD   Min      Max
  <chr>      <dbl>   <dbl>   <dbl> <dbl>    <dbl>
1 carat       0.8     0.7     0.47   0.2     5.01
2 depth      61.8    61.8     1.43  43      79   
3 table      57.5    57       2.23  43      95   
4 x           5.73    5.7     1.12   0      10.7 
5 y           5.73    5.71    1.14   0      58.9 
6 z           3.54    3.53    0.71   0      31.8 
7 price    3933.   2401    3989.   326   18823

Log(Price)

# Distribution of log(price)
ggplot(diamonds_clean, aes(x = log_price)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "white") +
  labs(title = "Distribution of log(Price)", x = "log(Price)", y = "Count")

Correlation Heatmap of Numerical Data

# Correlation matrix for numerical variables
num_vars <- diamonds_clean %>% 
  select(carat, depth, table, x, y, z, price)
corr <- cor(num_vars)
ggcorrplot(corr, lab = TRUE, title = "Correlation Heatmap of Numerical Variables")

Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
  Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.

Models

# Step-by-step model construction

# Baseline model: log(price) ~ log(carat)
model1 <- lm(log_price ~ log_carat, data = diamonds_clean)

# Add cut, color, clarity as categorical predictors
model2 <- lm(log_price ~ log_carat + cut + color + clarity, data = diamonds_clean)

# Full model: add geometric features
model3 <- lm(log_price ~ log_carat + cut + color + clarity + depth + table, data = diamonds_clean)
# Get tidy summary
results <- tidy(model2)
print(results)

# A tibble: 19 × 5
   term        estimate std.error statistic   p.value
   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
 1 (Intercept)  8.46      0.00117   7239.   0        
 2 log_carat    1.88      0.00113   1669.   0        
 3 cut.L        0.121     0.00235     51.3  0        
 4 cut.Q       -0.0351    0.00207    -17.0  2.44e- 64
 5 cut.C        0.0135    0.00180      7.51 5.92e- 14
 6 cut^4       -0.00161   0.00144     -1.11 2.65e-  1
 7 color.L     -0.440     0.00203   -217.   0        
 8 color.Q     -0.0957    0.00186    -51.4  0        
 9 color.C     -0.0148    0.00174     -8.47 2.55e- 17
10 color^4      0.0119    0.00160      7.42 1.15e- 13
11 color^5     -0.00222   0.00151     -1.47 1.42e-  1
12 color^6      0.00229   0.00138      1.67 9.58e-  2
13 clarity.L    0.917     0.00358    256.   0        
14 clarity.Q   -0.243     0.00333    -72.9  0        
15 clarity.C    0.132     0.00286     46.3  0        
16 clarity^4   -0.0661    0.00228    -28.9  1.35e-182
17 clarity^5    0.0275    0.00186     14.7  4.70e- 49
18 clarity^6   -0.00181   0.00162     -1.11 2.65e-  1
19 clarity^7    0.0336    0.00143     23.4  8.58e-121

summary(model1)


Call:
lm(formula = log_price ~ log_carat, data = diamonds_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.50852 -0.16951 -0.00594  0.16631  1.33796 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 8.448748   0.001365  6189.6   <2e-16 ***
log_carat   1.675908   0.001934   866.5   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2626 on 53918 degrees of freedom
Multiple R-squared:  0.933, Adjusted R-squared:  0.933 
F-statistic: 7.509e+05 on 1 and 53918 DF,  p-value: < 2.2e-16

summary(model2)


Call:
lm(formula = log_price ~ log_carat + cut + color + clarity, data = diamonds_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.01111 -0.08636 -0.00031  0.08342  1.94779 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  8.457037   0.001168 7238.916  < 2e-16 ***
log_carat    1.883745   0.001129 1668.528  < 2e-16 ***
cut.L        0.120737   0.002354   51.281  < 2e-16 ***
cut.Q       -0.035142   0.002072  -16.959  < 2e-16 ***
cut.C        0.013515   0.001799    7.512 5.92e-14 ***
cut^4       -0.001605   0.001441   -1.114   0.2653    
color.L     -0.439561   0.002027 -216.836  < 2e-16 ***
color.Q     -0.095702   0.001863  -51.378  < 2e-16 ***
color.C     -0.014760   0.001743   -8.468  < 2e-16 ***
color^4      0.011885   0.001601    7.424 1.15e-13 ***
color^5     -0.002219   0.001513   -1.467   0.1425    
color^6      0.002291   0.001375    1.666   0.0958 .  
clarity.L    0.916717   0.003582  255.950  < 2e-16 ***
clarity.Q   -0.243015   0.003334  -72.883  < 2e-16 ***
clarity.C    0.132430   0.002857   46.347  < 2e-16 ***
clarity^4   -0.066091   0.002285  -28.928  < 2e-16 ***
clarity^5    0.027474   0.001864   14.736  < 2e-16 ***
clarity^6   -0.001810   0.001624   -1.115   0.2650    
clarity^7    0.033555   0.001432   23.430  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1338 on 53901 degrees of freedom
Multiple R-squared:  0.9826,    Adjusted R-squared:  0.9826 
F-statistic: 1.693e+05 on 18 and 53901 DF,  p-value: < 2.2e-16

summary(model3)


Call:
lm(formula = log_price ~ log_carat + cut + color + clarity + 
    depth + table, data = diamonds_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.01154 -0.08632 -0.00032  0.08342  1.94116 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  8.5341918  0.0418202  204.068  < 2e-16 ***
log_carat    1.8838300  0.0011349 1659.875  < 2e-16 ***
cut.L        0.1184744  0.0026612   44.519  < 2e-16 ***
cut.Q       -0.0344439  0.0021294  -16.176  < 2e-16 ***
cut.C        0.0131590  0.0018302    7.190 6.56e-13 ***
cut^4       -0.0016521  0.0014640   -1.128   0.2591    
color.L     -0.4393955  0.0020292 -216.533  < 2e-16 ***
color.Q     -0.0956751  0.0018629  -51.358  < 2e-16 ***
color.C     -0.0148027  0.0017432   -8.492  < 2e-16 ***
color^4      0.0118843  0.0016011    7.423 1.16e-13 ***
color^5     -0.0021937  0.0015127   -1.450   0.1470    
color^6      0.0022783  0.0013753    1.657   0.0976 .  
clarity.L    0.9163025  0.0035879  255.384  < 2e-16 ***
clarity.Q   -0.2429325  0.0033346  -72.852  < 2e-16 ***
clarity.C    0.1322757  0.0028584   46.276  < 2e-16 ***
clarity^4   -0.0660370  0.0022849  -28.901  < 2e-16 ***
clarity^5    0.0273520  0.0018654   14.663  < 2e-16 ***
clarity^6   -0.0017544  0.0016238   -1.080   0.2799    
clarity^7    0.0335504  0.0014321   23.427  < 2e-16 ***
depth       -0.0009117  0.0004723   -1.931   0.0535 .  
table       -0.0003510  0.0003450   -1.017   0.3090    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1338 on 53899 degrees of freedom
Multiple R-squared:  0.9826,    Adjusted R-squared:  0.9826 
F-statistic: 1.523e+05 on 20 and 53899 DF,  p-value: < 2.2e-16

glance(model1)

# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.933         0.933 0.263   750873.       0     1 -4411. 8828. 8855.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

glance(model2)

# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic p.value    df logLik     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>
1     0.983         0.983 0.134   169267.       0    18 31961. -63883. -63705.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

glance(model3)

# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic p.value    df logLik     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>
1     0.983         0.983 0.134   152346.       0    20 31963. -63882. -63687.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Coefficient Estimates (Bar Plot)

# Model
model <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)

coef_df <- broom::tidy(model) %>%
  filter(term != "(Intercept)") %>%  
  mutate(term = reorder(term, estimate))

ggplot(coef_df, aes(x = term, y = estimate, fill = estimate > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Coefficient Estimates (Bar Plot)",
    x = "Predictor",
    y = "Estimate (log-price scale)"
  ) +
  theme_minimal(base_size = 13)

Residuals Diagnostics

# Residual plots
par(mfrow=c(2,2))
plot(model2)

Residual Density Plot

model <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)

# Kalıntıları (residuals) çıkar
residuals_df <- data.frame(Residuals = resid(model))

# Yoğunluk grafiği (density plot)
ggplot(residuals_df, aes(x = Residuals)) +
  geom_density(fill = "steelblue", alpha = 0.6) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Density Plot",
    x = "Residuals (log-price scale)",
    y = "Density"
  ) +
  theme_minimal(base_size = 13)

Cook’s Distance

model <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)

# Influence plot (Cook's Distance vs Leverage)
influencePlot(model, 
              id.method = "identify", 
              main = "Influence Plot (Cook's Distance vs Leverage)",
              sub = "Circle size represents Cook's Distance")

Warning in plot.window(...): "id.method" is not a graphical parameter

Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter

Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
a graphical parameter

Warning in box(...): "id.method" is not a graphical parameter

Warning in title(...): "id.method" is not a graphical parameter

Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
graphical parameter

        StudRes          Hat        CookD
27416 -4.356775 0.0020929076 0.0020945550
27631 -2.633114 0.0020777110 0.0007596723
46477 10.918700 0.0008652483 0.0054219293
49774 14.599569 0.0018859637 0.0211141828

influence_data <- data.frame(
  hat = hatvalues(model),
  stud_resid = rstudent(model),
  cooks = cooks.distance(model)
)

ggplot(influence_data, aes(x = hat, y = stud_resid)) +
  geom_point(aes(size = cooks), alpha = 0.5, color = "steelblue") +
  geom_hline(yintercept = c(-2, 0, 2), linetype = "dashed", color = "gray50") +
  geom_vline(xintercept = 2*mean(influence_data$hat), linetype = "dashed", color = "gray50") +
  scale_size_continuous(range = c(0.5, 10)) +
  labs(
    title = "Influence Plot (Cook's Distance vs Leverage)",
    x = "Hat-Values (Leverage)",
    y = "Studentized Residuals",
    size = "Cook's D"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

Homoskedasticity test

# Homoskedasticity test (Breusch-Pagan)
bptest(model2)


    studentized Breusch-Pagan test

data:  model2
BP = 1361.8, df = 18, p-value < 2.2e-16

Robust standard errors

coeftest(model2, vcov = vcovHC(model2, type = "HC1"))


t test of coefficients:

              Estimate Std. Error   t value  Pr(>|t|)    
(Intercept)  8.4570371  0.0016233 5209.8097 < 2.2e-16 ***
log_carat    1.8837455  0.0012971 1452.3177 < 2.2e-16 ***
cut.L        0.1207374  0.0029211   41.3326 < 2.2e-16 ***
cut.Q       -0.0351415  0.0025280  -13.9008 < 2.2e-16 ***
cut.C        0.0135145  0.0019858    6.8057 1.016e-11 ***
cut^4       -0.0016055  0.0014253   -1.1264   0.26001    
color.L     -0.4395608  0.0021557 -203.9081 < 2.2e-16 ***
color.Q     -0.0957023  0.0019619  -48.7813 < 2.2e-16 ***
color.C     -0.0147601  0.0017917   -8.2379 < 2.2e-16 ***
color^4      0.0118852  0.0016039    7.4103 1.278e-13 ***
color^5     -0.0022186  0.0015053   -1.4739   0.14052    
color^6      0.0022907  0.0013370    1.7133   0.08667 .  
clarity.L    0.9167168  0.0053461  171.4743 < 2.2e-16 ***
clarity.Q   -0.2430149  0.0051072  -47.5830 < 2.2e-16 ***
clarity.C    0.1324298  0.0042271   31.3290 < 2.2e-16 ***
clarity^4   -0.0660911  0.0031009  -21.3137 < 2.2e-16 ***
clarity^5    0.0274741  0.0022230   12.3592 < 2.2e-16 ***
clarity^6   -0.0018098  0.0017524   -1.0328   0.30172    
clarity^7    0.0335550  0.0014129   23.7493 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Shapiro-Wilk normality test

# Normality of residuals (Shapiro test on sample)
set.seed(123)
sample_resid <- sample(resid(model3), 5000)
shapiro.test(sample_resid)


    Shapiro-Wilk normality test

data:  sample_resid
W = 0.96887, p-value < 2.2e-16

Cross-validation (10-fold)

# Cross-validation
train_control <- trainControl(
  method = "cv",       # "cv" = cross-validation
  number = 10,         # 10 katlı
  verboseIter = FALSE  # eğitim sürecini göstermesin
)

cv_model <- train(
  log_price ~ log_carat + cut + color + clarity, data = diamonds_clean,
  method = "lm",
  trControl = train_control)

Actual vs Predicted Prices

actual_vs_predicted <- data.frame(
  Actual_Price = diamonds_clean$price,
  Predicted_Price = exp(predict(model2, newdata = diamonds_clean))
)

ggplot(actual_vs_predicted, aes(x = Actual_Price, y = Predicted_Price)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  labs(title = "Actual vs Predicted Prices") +
  theme_minimal()

Model Evaluation Metrics

library(Metrics)


Attaching package: 'Metrics'

The following objects are masked from 'package:caret':

    precision, recall

rmse_value <- rmse(actual_vs_predicted$Actual_Price, actual_vs_predicted$Predicted_Price)
r2_value <- cor(actual_vs_predicted$Actual_Price, actual_vs_predicted$Predicted_Price)^2
mae_value <- mae(actual_vs_predicted$Actual_Price, actual_vs_predicted$Predicted_Price)

data.frame(
  Metric = c("RMSE", "R²", "MAE"),
  Value = c(rmse_value, r2_value, mae_value)
)

  Metric      Value
1   RMSE 799.318172
2     R²   0.959949
3    MAE 403.625421

Variance Inflation Factor (VIF) — Multicollinearity Check

vif(model2)

              GVIF Df GVIF^(1/(2*Df))
log_carat 1.312871  1        1.145806
cut       1.105702  4        1.012639
color     1.139994  6        1.010978
clarity   1.323472  7        1.020220