── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.3.0
✔ purrr 1.1.0 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# hexbin backs geom_hex(); scales supplies axis label formatters (scales::comma).
library(hexbin)
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
# One library() call per line; fused calls on a single line do not parse.
library(ggridges)
library(janitor)
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
# Attach order is kept as in the original: car is attached after dplyr, so
# car::recode() masks dplyr::recode().
library(dplyr)
library(broom)
library(car)
Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
library(caret)
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# ggcorrplot draws the correlation heat map; lmtest is attached for model tests.
library(ggcorrplot)
library(lmtest)
Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
# Price against carat: semi-transparent scatter with a red GAM smooth
# (y ~ s(x)) and no confidence band.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = FALSE,
    color = "red"
  ) +
  labs(
    title = "Price - Carat",
    x = "Carat",
    y = "Price"
  )
Price vs. Carat (Heatmap)
# This heat map visualizes the relationship between diamond carat and price,
# where color intensity represents the number of diamonds in each hexagon.
# Price is drawn on a log10 axis with comma-formatted labels.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_hex(bins = 40) +
  scale_y_log10(labels = scales::comma) +
  labs(title = "Price ~ Carat (hexbin, log10 price)") +
  guides(fill = guide_colorbar(title = "Count"))
Distribution of Numerical Variables
# Purpose: see skewness, outliers, and typical ranges.
# The numeric columns are stacked into name/value pairs so that each variable
# gets its own histogram panel with free axis scales.
diamonds %>%
  select(price, carat, depth, table, x, y, z) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 40) +
  facet_wrap(~name, scales = "free") +
  labs(title = "Distributions of Numerical Variables")
Cleaning Data
# Remove potential outliers (x, y, z = 0 are invalid dimensions).
diamonds_clean <- diamonds %>%
  filter(x > 0, y > 0, z > 0)

# Log-transform price and carat to handle right-skewed distributions.
diamonds_clean <- diamonds_clean %>%
  mutate(
    log_price = log(price),
    log_carat = log(carat)
  )
# A tibble: 7 × 6
Variable Mean Median SD Min Max
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 carat 0.8 0.7 0.47 0.2 5.01
2 depth 61.8 61.8 1.43 43 79
3 table 57.5 57 2.23 43 95
4 x 5.73 5.7 1.12 0 10.7
5 y 5.73 5.71 1.14 0 58.9
6 z 3.54 3.53 0.71 0 31.8
7 price 3933. 2401 3989. 326 18823
Log(Price)
# Distribution of log(price) on the cleaned data.
ggplot(diamonds_clean, aes(x = log_price)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of log(Price)",
    x = "log(Price)",
    y = "Count"
  )
Correlation Heatmap of Numerical Data
# Correlation matrix for numerical variables.
num_vars <- diamonds_clean %>%
  select(carat, depth, table, x, y, z, price)

corr <- cor(num_vars)

# lab = TRUE prints each correlation coefficient inside its tile.
ggcorrplot(
  corr,
  lab = TRUE,
  title = "Correlation Heatmap of Numerical Variables"
)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
Models
# Step-by-step model construction

# Baseline model: log(price) ~ log(carat)
model1 <- lm(log_price ~ log_carat, data = diamonds_clean)

# Add cut, color, clarity as categorical predictors
model2 <- lm(
  log_price ~ log_carat + cut + color + clarity,
  data = diamonds_clean
)

# Full model: additionally include depth and table
model3 <- lm(
  log_price ~ log_carat + cut + color + clarity + depth + table,
  data = diamonds_clean
)

# Get tidy summary (one row per coefficient) of the categorical model
results <- tidy(model2)
print(results)
# Refit the log-log model with the categorical predictors.
# NOTE(review): this uses the raw `diamonds` data rather than the cleaned
# data used for the step-by-step models -- confirm that is intended.
model <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)

# Extract the residuals
residuals_df <- data.frame(Residuals = resid(model))

# Density plot of the residuals, with a dashed reference line at zero
ggplot(residuals_df, aes(x = Residuals)) +
  geom_density(fill = "steelblue", alpha = 0.6) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Density Plot",
    x = "Residuals (log-price scale)",
    y = "Density"
  ) +
  theme_minimal(base_size = 13)
Cook’s Distance
# Refit the log-log model so this chunk is self-contained.
model <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)

# Influence plot. Point labelling is controlled through `id = list(...)`.
# The former `id.method` argument is not recognised by influencePlot(); it was
# forwarded to the base plotting functions and only produced
# '"id.method" is not a graphical parameter' warnings. "noteworthy" is the
# labelling that was effectively applied (the default), and unlike "identify"
# it needs no interactive graphics device.
influencePlot(
  model,
  id = list(method = "noteworthy", n = 2),
  main = "Influence Plot (Cook's Distance vs Leverage)",
  sub = "Circle size represents Cook's Distance"
)
Warning in plot.window(...): "id.method" is not a graphical parameter
Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
a graphical parameter
Warning in box(...): "id.method" is not a graphical parameter
Warning in title(...): "id.method" is not a graphical parameter
Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
graphical parameter