Diamond Price Regression: Kitchen-Sink Model and Backward Selection

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(MASS)


Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

# Step 1: Load the raw data and print a summary-statistics table ----
# The data come from the `diamonds` dataset shipped with ggplot2 (attached via
# tidyverse); the namespace-qualified name makes that origin explicit.
df <- ggplot2::diamonds

cat("Summary Statistics of the Raw Data:\n")

Summary Statistics of the Raw Data:

# Per-column summary of the raw data (quantiles for numerics, counts for factors)
df %>%
  summary() %>%
  print()

     carat               cut        color        clarity          depth      
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
 Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
                                    J: 2808   (Other): 2531                  
     table           price             x                y         
 Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
 1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
 Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
 Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
 3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
 Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
                                                                  
       z         
 Min.   : 0.000  
 1st Qu.: 2.910  
 Median : 3.530  
 Mean   : 3.539  
 3rd Qu.: 4.040  
 Max.   :31.800

# Step 2: Build 'clean_df' from the raw data ----
# No columns are dropped in this step; only rows are removed:
#   1. rows whose x, y or z is not strictly positive (the raw summary shows a
#      minimum of 0 for each — presumably unrecorded measurements);
#   2. any row that still contains an NA in any column.
# NOTE(review): the extreme maxima y = 58.9 and z = 31.8 survive this filter
# (3rd quartiles are 6.54 and 4.04) — confirm whether they should be excluded.
clean_df <- df %>%
  filter(x > 0 & y > 0 & z > 0) %>%
  na.omit()

cat("\nDimensions of the cleaned dataframe (clean_df):\n")


Dimensions of the cleaned dataframe (clean_df):

# Rows and columns remaining after cleaning
clean_df %>%
  dim() %>%
  print()

[1] 53920    10

# Heading for the post-cleaning summary table that is printed next
cat("\nSummary of the Cleaned Data:\n")


Summary of the Cleaned Data:

# Same per-column summary as for the raw data, now on the cleaned rows
clean_df %>%
  summary() %>%
  print()

     carat               cut        color        clarity          depth      
 Min.   :0.2000   Fair     : 1609   D: 6774   SI1    :13063   Min.   :43.00  
 1st Qu.:0.4000   Good     : 4902   E: 9797   VS2    :12254   1st Qu.:61.00  
 Median :0.7000   Very Good:12081   F: 9538   SI2    : 9185   Median :61.80  
 Mean   :0.7977   Premium  :13780   G:11284   VS1    : 8170   Mean   :61.75  
 3rd Qu.:1.0400   Ideal    :21548   H: 8298   VVS2   : 5066   3rd Qu.:62.50  
 Max.   :5.0100                     I: 5421   VVS1   : 3654   Max.   :79.00  
                                    J: 2808   (Other): 2528                  
     table           price             x                y         
 Min.   :43.00   Min.   :  326   Min.   : 3.730   Min.   : 3.680  
 1st Qu.:56.00   1st Qu.:  949   1st Qu.: 4.710   1st Qu.: 4.720  
 Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
 Mean   :57.46   Mean   : 3931   Mean   : 5.732   Mean   : 5.735  
 3rd Qu.:59.00   3rd Qu.: 5323   3rd Qu.: 6.540   3rd Qu.: 6.540  
 Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
                                                                  
       z        
 Min.   : 1.07  
 1st Qu.: 2.91  
 Median : 3.53  
 Mean   : 3.54  
 3rd Qu.: 4.04  
 Max.   :31.80

# Step 3: Fit a 'kitchen sink' model on clean_df ----
# Response: price. Predictors: every other column of clean_df.
independent_vars <- setdiff(names(clean_df), "price")

# reformulate() assembles `price ~ <var1> + <var2> + ...` directly from the
# vector of predictor names, avoiding hand-pasted formula strings.
kitchen_sink_formula <- reformulate(independent_vars, response = "price")

cat("\nKitchen Sink Model Formula:\n")


Kitchen Sink Model Formula:

# Show the assembled model formula
kitchen_sink_formula %>%
  print()

price ~ carat + cut + color + clarity + depth + table + x + y + 
    z

# Ordinary least squares fit of price on all remaining columns.
# The formula and data are passed by name (not piped in) so that the call
# stored on the model reads `lm(formula = kitchen_sink_formula,
# data = clean_df)`; that stored call is echoed in the summary and is what a
# later refit of the model would evaluate.
# cut, color and clarity enter through polynomial-contrast terms (.L, .Q, .C,
# ^4, ...), as seen in the coefficient table, so their rows are not
# per-level effects.
kitchen_sink_model <- lm(formula = kitchen_sink_formula, data = clean_df)

cat("\nSummary of the Kitchen Sink Model:\n")


Summary of the Kitchen Sink Model:

# Coefficient table, residual quantiles and fit statistics for the full model
kitchen_sink_model %>%
  summary() %>%
  print()


Call:
lm(formula = kitchen_sink_formula, data = clean_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-21888.2   -586.5   -183.8    370.7  10734.4 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  6279.499    402.523   15.600  < 2e-16 ***
carat       11525.671     51.630  223.235  < 2e-16 ***
cut.L         578.181     22.431   25.776  < 2e-16 ***
cut.Q        -297.280     17.960  -16.553  < 2e-16 ***
cut.C         147.884     15.451    9.571  < 2e-16 ***
cut^4         -21.592     12.348   -1.749  0.08035 .  
color.L     -1958.471     17.305 -113.176  < 2e-16 ***
color.Q      -679.765     15.745  -43.174  < 2e-16 ***
color.C      -164.263     14.689  -11.183  < 2e-16 ***
color^4        41.555     13.495    3.079  0.00208 ** 
color^5       -97.374     12.747   -7.639 2.22e-14 ***
color^6       -48.196     11.588   -4.159 3.20e-05 ***
clarity.L    4085.291     30.227  135.156  < 2e-16 ***
clarity.Q   -1936.744     28.205  -68.667  < 2e-16 ***
clarity.C     990.568     24.125   41.060  < 2e-16 ***
clarity^4    -366.867     19.254  -19.054  < 2e-16 ***
clarity^5     237.369     15.720   15.100  < 2e-16 ***
clarity^6       5.298     13.683    0.387  0.69858    
clarity^7      88.931     12.075    7.365 1.80e-13 ***
depth         -65.077      4.639  -14.029  < 2e-16 ***
table         -26.444      2.905   -9.102  < 2e-16 ***
x           -1100.681     34.993  -31.455  < 2e-16 ***
y              25.933     19.447    1.334  0.18236    
z            -114.981     37.840   -3.039  0.00238 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1127 on 53896 degrees of freedom
Multiple R-squared:  0.9201,    Adjusted R-squared:  0.9201 
F-statistic: 2.699e+04 on 23 and 53896 DF,  p-value: < 2.2e-16

# Heading that precedes the stepwise-selection trace
cat("\nPerforming Backward Selection...\n")


Performing Backward Selection...

# Backward elimination starting from the kitchen sink model: at each step the
# term whose removal gives the lowest AIC is dropped, until no removal improves
# on the current model. The per-step trace is printed as a side effect.
# The call is namespace-qualified to make the origin of stepAIC() explicit;
# note that attaching MASS after tidyverse masks dplyr::select().
backward_selection_model <- MASS::stepAIC(
  object = kitchen_sink_model,
  direction = "backward"
)

Start:  AIC=757873.4
price ~ carat + cut + color + clarity + depth + table + x + y + 
    z

          Df  Sum of Sq        RSS    AIC
- y        1 2.2597e+06 6.8487e+10 757873
<none>                  6.8485e+10 757873
- z        1 1.1733e+07 6.8497e+10 757881
- table    1 1.0526e+08 6.8590e+10 757954
- depth    1 2.5008e+08 6.8735e+10 758068
- cut      4 8.4437e+08 6.9329e+10 758526
- x        1 1.2572e+09 6.9742e+10 758852
- color    6 1.7201e+10 8.5686e+10 769944
- clarity  7 3.5365e+10 1.0385e+11 780308
- carat    1 6.3324e+10 1.3181e+11 793175

Step:  AIC=757873.2
price ~ carat + cut + color + clarity + depth + table + x + z

          Df  Sum of Sq        RSS    AIC
<none>                  6.8487e+10 757873
- z        1 1.0483e+07 6.8498e+10 757879
- table    1 1.0584e+08 6.8593e+10 757954
- depth    1 2.5718e+08 6.8745e+10 758073
- cut      4 8.4554e+08 6.9333e+10 758527
- x        1 1.5073e+09 6.9995e+10 759045
- color    6 1.7202e+10 8.5689e+10 769943
- clarity  7 3.5376e+10 1.0386e+11 780313
- carat    1 6.3359e+10 1.3185e+11 793188

# Heading for the summary of the model retained by backward selection
cat("\nSummary of the Backward Selection Model:\n")


Summary of the Backward Selection Model:

# Coefficient table and fit statistics for the selected model
backward_selection_model %>%
  summary() %>%
  print()


Call:
lm(formula = price ~ carat + cut + color + clarity + depth + 
    table + x + z, data = clean_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-21894.2   -586.3   -183.7    370.7  10734.0 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  6322.934    401.205   15.760  < 2e-16 ***
carat       11526.937     51.622  223.296  < 2e-16 ***
cut.L         578.563     22.430   25.795  < 2e-16 ***
cut.Q        -298.087     17.950  -16.607  < 2e-16 ***
cut.C         148.978     15.430    9.655  < 2e-16 ***
cut^4         -21.124     12.343   -1.711  0.08700 .  
color.L     -1958.496     17.305 -113.176  < 2e-16 ***
color.Q      -679.789     15.745  -43.176  < 2e-16 ***
color.C      -164.256     14.689  -11.182  < 2e-16 ***
color^4        41.540     13.495    3.078  0.00208 ** 
color^5       -97.330     12.747   -7.636 2.28e-14 ***
color^6       -48.162     11.588   -4.156 3.24e-05 ***
clarity.L    4085.868     30.224  135.188  < 2e-16 ***
clarity.Q   -1937.110     28.204  -68.683  < 2e-16 ***
clarity.C     990.895     24.124   41.075  < 2e-16 ***
clarity^4    -367.035     19.253  -19.063  < 2e-16 ***
clarity^5     237.559     15.719   15.112  < 2e-16 ***
clarity^6       5.265     13.683    0.385  0.70040    
clarity^7      88.891     12.075    7.362 1.84e-13 ***
depth         -65.680      4.617  -14.227  < 2e-16 ***
table         -26.513      2.905   -9.127  < 2e-16 ***
x           -1079.969     31.357  -34.441  < 2e-16 ***
z            -107.476     37.419   -2.872  0.00408 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1127 on 53897 degrees of freedom
Multiple R-squared:  0.9201,    Adjusted R-squared:  0.9201 
F-statistic: 2.821e+04 on 22 and 53897 DF,  p-value: < 2.2e-16

# The terms dropped by the selection are those that appear in the kitchen sink
# formula but are absent from the final formula printed below.
cat("\nFinal model formula after backward selection:\n")


Final model formula after backward selection:

# Extract and show the formula of the selected model
backward_selection_model %>%
  formula() %>%
  print()

price ~ carat + cut + color + clarity + depth + table + x + z