members <- c("Utkarsh Ojha") # Edit this
roll_number <- c("MBAA23077")
## DON'T EDIT FOLLOWING CODE
group <- data.frame(members, roll_number)
kbl(group, align = "c", col.names = c("Name", "Roll Number")) %>%
kable_classic(full_width = F) %>% kable_material()
| Name | Roll Number |
|---|---|
| Utkarsh Ojha | MBAA23077 |
A large Indian retail chain has stores across 3 states in India: Maharashtra, Telangana and Kerala. These stores stock products across various categories such as FMCG (fast moving consumer goods), eatables / perishables and others. Managing the inventory is crucial for the revenue stream of the retail chain. Meeting the demand is important to not lose potential revenue, while at the same time stocking excessive products could lead to losses.
The information provided above, along with all essential data files,
was collected from kaggle.com.
These data files have been organized within the data subfolder
located under the project folder. You can access these files by
executing the command read.csv("./data/filename.csv").
Variables in train_data.csv:
date : The date for which the observation was
recorded
product_identifier : The id for a product
department_identifier : The id for a specific
department in a store
category_of_product : The category to which a
product belongs
outlet : The id for a store
state : The name of the state
sales : The number of sales for the product
Auxiliary Datasets:
product_prices.csv : The prices of products at each
store for each week
date_to_week_id_map.csv : The mapping from a date to
the week_id
Showcase at least five regression techniques for predicting sales. Determine which method performs better and explain why it is superior. The process you should follow to obtain these results involves a structured pipeline consisting of Exploratory Data Analysis (EDA), preprocessing, model building, and an evaluation strategy. Additionally, interpret model(s) such a way that illustrate which variables affect sales.
#setwd(“C:/Users/HP/OneDrive/Desktop/ADA/course-project/course-project”)
#Loading the datasets#
train_data <- read.csv("./data/train_data.csv")
product_prices <- read.csv("./data/product_prices.csv")
date_to_week_id <- read.csv("./data/date_to_week_id_map.csv")
#Examining the structure of the datasets#
head(train_data)
## date product_identifier department_identifier
## 1 2012-01-01 74 11
## 2 2012-01-01 337 11
## 3 2012-01-01 423 12
## 4 2012-01-01 432 12
## 5 2012-01-01 581 21
## 6 2012-01-01 611 21
## category_of_product outlet state sales
## 1 others 111 Maharashtra 0
## 2 others 111 Maharashtra 1
## 3 others 111 Maharashtra 0
## 4 others 111 Maharashtra 0
## 5 fast_moving_consumer_goods 111 Maharashtra 0
## 6 fast_moving_consumer_goods 111 Maharashtra 0
head(product_prices)
## outlet product_identifier week_id sell_price
## 1 111 74 49 2.94
## 2 111 74 50 2.94
## 3 111 74 51 2.94
## 4 111 74 52 2.94
## 5 111 74 53 2.94
## 6 111 74 54 2.94
head(date_to_week_id)
## date week_id
## 1 2012-01-01 49
## 2 2012-01-02 49
## 3 2012-01-03 49
## 4 2012-01-04 49
## 5 2012-01-05 49
## 6 2012-01-06 49
#Checking column data types#
print("Data Types for Each Dataset:")
## [1] "Data Types for Each Dataset:"
str(train_data)
## 'data.frame': 395000 obs. of 7 variables:
## $ date : chr "2012-01-01" "2012-01-01" "2012-01-01" "2012-01-01" ...
## $ product_identifier : int 74 337 423 432 581 611 631 659 743 797 ...
## $ department_identifier: int 11 11 12 12 21 21 21 21 21 21 ...
## $ category_of_product : chr "others" "others" "others" "others" ...
## $ outlet : int 111 111 111 111 111 111 111 111 111 111 ...
## $ state : chr "Maharashtra" "Maharashtra" "Maharashtra" "Maharashtra" ...
## $ sales : int 0 1 0 0 0 0 0 0 0 0 ...
str(product_prices)
## 'data.frame': 59000 obs. of 4 variables:
## $ outlet : int 111 111 111 111 111 111 111 111 111 111 ...
## $ product_identifier: int 74 74 74 74 74 74 74 74 74 74 ...
## $ week_id : int 49 50 51 52 53 54 55 56 57 58 ...
## $ sell_price : num 2.94 2.94 2.94 2.94 2.94 2.94 2.94 2.94 2.94 2.94 ...
str(date_to_week_id)
## 'data.frame': 821 obs. of 2 variables:
## $ date : chr "2012-01-01" "2012-01-02" "2012-01-03" "2012-01-04" ...
## $ week_id: int 49 49 49 49 49 49 50 50 50 50 ...
#Merge train_data with date_to_week_id#
train_with_week <- train_data %>%
left_join(date_to_week_id, by = "date")
#Merge the above with product_prices#
merged_data <- train_with_week %>%
left_join(product_prices, by =
c("product_identifier", "outlet", "week_id"))
#Check for missing values in the merged dataset#
print("Missing Values in the Complete Dataset:")
## [1] "Missing Values in the Complete Dataset:"
sapply(merged_data, function(x) sum(is.na(x)))
## date product_identifier department_identifier
## 0 0 0
## category_of_product outlet state
## 0 0 0
## sales week_id sell_price
## 0 0 0
#There are 0 missing value in all the three datasets#
merged_data $date <- as.Date(merged_data $date, format = "%Y-%m-%d")
#Extract day of the week#
merged_data $day_of_week <- weekdays(merged_data $date)
#Extract month from the date#
merged_data$month <- as.numeric(format(as.Date(merged_data$date), "%m"))
#Summary Statistics#
print("Summary of Merged Data:")
## [1] "Summary of Merged Data:"
summary(merged_data)
## date product_identifier department_identifier
## Min. :2012-01-01 Min. : 74 Min. :11.00
## 1st Qu.:2012-07-16 1st Qu.: 926 1st Qu.:21.00
## Median :2013-01-29 Median :1325 Median :22.00
## Mean :2013-01-29 Mean :1510 Mean :24.46
## 3rd Qu.:2013-08-15 3rd Qu.:1753 3rd Qu.:31.00
## Max. :2014-02-28 Max. :3021 Max. :33.00
## category_of_product outlet state sales
## Length:395000 Min. :111.0 Length:395000 Min. : 0.000
## Class :character 1st Qu.:113.0 Class :character 1st Qu.: 0.000
## Mode :character Median :221.5 Mode :character Median : 0.000
## Mean :211.2 Mean : 1.229
## 3rd Qu.:331.0 3rd Qu.: 1.000
## Max. :333.0 Max. :293.000
## week_id sell_price day_of_week month
## Min. : 49.0 Min. : 0.050 Length:395000 Min. : 1.000
## 1st Qu.: 77.0 1st Qu.: 2.680 Class :character 1st Qu.: 3.000
## Median :105.0 Median : 3.980 Mode :character Median : 6.000
## Mean :105.1 Mean : 4.988 Mean : 6.143
## 3rd Qu.:133.0 3rd Qu.: 6.480 3rd Qu.: 9.000
## Max. :161.0 Max. :44.360 Max. :12.000
#Display the structure of the merged dataset#
print("Structure of Complete Dataset:")
## [1] "Structure of Complete Dataset:"
str(merged_data)
## 'data.frame': 395000 obs. of 11 variables:
## $ date : Date, format: "2012-01-01" "2012-01-01" ...
## $ product_identifier : int 74 337 423 432 581 611 631 659 743 797 ...
## $ department_identifier: int 11 11 12 12 21 21 21 21 21 21 ...
## $ category_of_product : chr "others" "others" "others" "others" ...
## $ outlet : int 111 111 111 111 111 111 111 111 111 111 ...
## $ state : chr "Maharashtra" "Maharashtra" "Maharashtra" "Maharashtra" ...
## $ sales : int 0 1 0 0 0 0 0 0 0 0 ...
## $ week_id : int 49 49 49 49 49 49 49 49 49 49 ...
## $ sell_price : num 2.94 7.44 0.97 4.97 4.88 2.84 6.97 3.97 4.47 2.64 ...
## $ day_of_week : chr "Sunday" "Sunday" "Sunday" "Sunday" ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
#Take a random sample of 5000 observations from the ‘sell_price’and ’sales’ column#
sell_price_sample <- sample(merged_data$sell_price, 5000)
sales_sample <- sample(merged_data$sales, 5000)
#Perform the Shapiro-Wilk test on the sample#
shapiro.test(sell_price_sample)
##
## Shapiro-Wilk normality test
##
## data: sell_price_sample
## W = 0.68533, p-value < 0.00000000000000022
shapiro.test(sales_sample)
##
## Shapiro-Wilk normality test
##
## data: sales_sample
## W = 0.38608, p-value < 0.00000000000000022
#Interpretation of Shapiro’s Normality Test#:
Both samples, sell_price_sample and sales_sample, exhibit strong evidence against the null hypothesis of normality based on the Shapiro-Wilk test.
The extremely low p-values suggest that the distributions of both samples deviate significantly from a normal distribution.
This information is relevant when considering statistical methods that assume normality, as deviations from normality may affect the validity of certain analyses.
Hence, we may conclude that both sell_price and sales are significantly different from normal distribution.
#Spearman’s correlation coefficient#
cor.test(merged_data$sell_price, merged_data$sales, exact = FALSE, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: merged_data$sell_price and merged_data$sales
## S = 13328113088461144, p-value < 0.00000000000000022
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2975635
#Interpretation of Spearman’s correlation coefficient#:
The negative Spearman’s rank correlation coefficient (rho = -0.2975635) suggests a moderate negative monotonic relationship between sell_price and sales.
The extremely small p-value (< 0.00000000000000022) indicates strong evidence against the null hypothesis, suggesting that there is a significant Spearman’s rank correlation between sell_price and sales.
The alternative hypothesis is supported, indicating that the true Spearman’s rank correlation between sell_price and sales is not equal to 0.
The large test statistic (S) contributes to the rejection of the null hypothesis
columns_to_factor <- c("day_of_week","department_identifier","category_of_product","state","product_identifier", "outlet", "week_id" , "day_of_week", "month")
#Convert specified columns to factors#
merged_data <- merged_data %>%
mutate(across(all_of(columns_to_factor), as.factor))
#Check levels of categorical variables#
sapply(merged_data[, sapply(merged_data, is.factor)], levels)
## $product_identifier
## [1] "74" "337" "423" "432" "581" "611" "631" "659" "743" "797"
## [11] "868" "904" "926" "972" "973" "1054" "1135" "1173" "1190" "1196"
## [21] "1228" "1240" "1242" "1275" "1322" "1328" "1365" "1424" "1472" "1508"
## [31] "1542" "1548" "1599" "1629" "1672" "1694" "1727" "1753" "2294" "2332"
## [41] "2492" "2768" "2794" "2818" "2853" "2932" "2935" "3004" "3008" "3021"
##
## $department_identifier
## [1] "11" "12" "21" "22" "31" "33"
##
## $category_of_product
## [1] "drinks_and_food" "fast_moving_consumer_goods"
## [3] "others"
##
## $outlet
## [1] "111" "112" "113" "114" "221" "222" "223" "331" "332" "333"
##
## $state
## [1] "Kerala" "Maharashtra" "Telangana"
##
## $week_id
## [1] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [13] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [25] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [37] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [49] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [61] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [73] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [85] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [97] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156"
## [109] "157" "158" "159" "160" "161"
##
## $day_of_week
## [1] "Friday" "Monday" "Saturday" "Sunday" "Thursday" "Tuesday"
## [7] "Wednesday"
##
## $month
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
Factors are categorical variables in R, used to represent discrete groups or levels. Converting relevant columns to factors helps R recognize these columns for tasks like statistical analysis, modeling, and visualization.
I’ve factor a few columns or variables of our
merged_data dataframe. These columns are
day_of_week, department_identifier,
category_of_product, state,
product_identifier, outlet,
week_id and day_of_week, ‘month’
These columns are factored to prepare them as input (predictor or outcome) for our regression models as most of these columns use numbers as their values and these numbers are nominal in nature and are used as ids to identify certain objects. The character columns when converted into factors make them usable for our regression models as character datatype variables in general cannot be used in any of the regression models
#1 Histogram for ‘sales’ column#
hist(merged_data$sales, main="Histogram of Sales", xlab="Sales", col="skyblue", border="black")
#Interpretation of the Sales Histogram:#
We can notice that the sales is extremely skewed on the right. This means most of the transactions are lower values. In our case, More than half of our data has sales with “0” number of products sold.
#2 Density Plot for ‘sell_price’ column#
merged_data %>%
ggplot(aes(x=sell_price)) +
geom_density(fill="green", color="#e9ecef", alpha=100,main="Density Plot of Sell Price", xlab="Sell Price", border="black")
#Interpretation of the Selling Price Density Plot:#
We can notice that the selling price is extremely skewed on the right. This means most of the price values are lower values. In our case, Majority of our product prices have a value less than 10.
#3 Sales across Days#
ggplot(merged_data, aes(x = day_of_week, y = sales)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Bar Plot of Total Sales by Day of the Week",
x = "Day of the Week",
y = "Total Sales") +
theme_minimal()
#Interpretation of the Sales across Days of the week Bar Plot:#
The bar plot shows that sales peak on weekends, with Saturday and Sunday having the highest total sales. Sales are relatively consistent but lower on weekdays, indicating stronger consumer activity during the weekend.
#4 Sales across category of product and grouped by state#
merged_data %>%
ggplot(aes(x = category_of_product, y = sales, fill = factor(state))) +
geom_bar(stat = "Identity", position = "dodge")
#Interpretation of the Sales across category of product and grouped by
state Multi-Bar Plot:#
The drinks_and_food category has the most number of
products sold followed by
fast_moving_consumer_goods
The state of Telangana tops the sales in
drinks_and_food category followed by Kerala.
Unlike in the drinks_and_food category, teh state of
Maharastra has most of the products sold under the
fast_moving_consumer_goods catergory.
#5 Sales across outlets#
merged_data %>%
ggplot(aes(x = factor(outlet), y = sales, fill = factor(department_identifier))) +
geom_bar(stat = "Identity")
#Interpretation of the Sales across outlets Stacked Bar Plot:#
We can notice the highest number of products sold by outlet 113. On the contrary, the least number of products sold was made by outlet 114.
Across all of the outlets, department 33 makes most of the sales, whereas, department 11 makes the least of the sales.
#6 Product-wise Sales#
product_wise_sales <- merged_data %>%
group_by(product_identifier) %>%
summarise(total_sales = sum(sales)) %>%
arrange(desc(total_sales))
#Top 10 Products for clarity#
top_products <- head(product_wise_sales, 10)
#Plot Product-wise Sales (Top 10 Products)#
ggplot(top_products, aes(x = factor(product_identifier), y = total_sales, fill = product_identifier)) +
geom_bar(stat = "identity") +
ggtitle("Top 10 Product-wise Sales") +
xlab("Product Identifier") + ylab("Total Sales") +
theme_minimal()
#Split data into train and test sets#
set.seed(16)
train_index <- createDataPartition(merged_data$sales, p = 0.8, list = FALSE)
train_data <- merged_data[train_index, ]
test_data <- merged_data[-train_index, ]
mlm<- lm(sales ~ sell_price + state + category_of_product, data = train_data)
summary(mlm)
##
## Call:
## lm(formula = sales ~ sell_price + state + category_of_product,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.760 -1.085 -0.598 0.276 290.490
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2.614918 0.014977 174.595
## sell_price -0.120169 0.001795 -66.942
## stateMaharashtra 0.169118 0.014929 11.328
## stateTelangana -0.080570 0.015963 -5.047
## category_of_productfast_moving_consumer_goods -1.222456 0.015014 -81.420
## category_of_productothers -1.499632 0.024431 -61.382
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## sell_price < 0.0000000000000002 ***
## stateMaharashtra < 0.0000000000000002 ***
## stateTelangana 0.000000448 ***
## category_of_productfast_moving_consumer_goods < 0.0000000000000002 ***
## category_of_productothers < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.475 on 315994 degrees of freedom
## Multiple R-squared: 0.06342, Adjusted R-squared: 0.0634
## F-statistic: 4279 on 5 and 315994 DF, p-value: < 0.00000000000000022
#Predict on training and test data#
test_predictions <- predict(mlm, newdata = test_data)
#RMSE Calculation Function#
calculate_rmse <- function(true_values, predictions) {
sqrt(mean((true_values - predictions)^2))
}
#Calculating Test RMSE#
test_rmse <- calculate_rmse(test_data$sales, test_predictions)
test_R2 <- R2(test_predictions, test_data$sales)
#Print results#
cat("Multiple regression Test RMSE:", test_rmse, "\n")
## Multiple regression Test RMSE: 3.500064
cat("Multiple regression Test R2:", test_R2, "\n")
## Multiple regression Test R2: 0.06228829
#Automatic BP test#
#Null: Homoskedasticity #
bptest(mlm)
##
## studentized Breusch-Pagan test
##
## data: mlm
## BP = 873.73, df = 5, p-value < 0.00000000000000022
This indicates strong evidence of heteroskedasticity in the model, meaning that the variance of the residuals is not constant across all levels of the independent variables.
#White heteroscedasticity-robust SE:#
coeftest(mlm, vcov = vcovHC, type="HC0")
##
## t test of coefficients:
##
## Estimate Std. Error t value
## (Intercept) 2.6149177 0.0242801 107.6980
## sell_price -0.1201686 0.0013641 -88.0908
## stateMaharashtra 0.1691179 0.0160100 10.5633
## stateTelangana -0.0805702 0.0166273 -4.8457
## category_of_productfast_moving_consumer_goods -1.2224565 0.0151078 -80.9155
## category_of_productothers -1.4996324 0.0180284 -83.1819
## Pr(>|t|)
## (Intercept) < 0.00000000000000022 ***
## sell_price < 0.00000000000000022 ***
## stateMaharashtra < 0.00000000000000022 ***
## stateTelangana 0.000001262 ***
## category_of_productfast_moving_consumer_goods < 0.00000000000000022 ***
## category_of_productothers < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The heteroskedasticity-robust standard errors and t-tests suggest that the relationships between the predictors (sell_price, state, category_of_product) and the dependent variable (sales) are statistically significant, even after accounting for potential heteroskedasticity
#Building the initial regression tree model#
#Small cp to grow a large tree#
tree_model <-rpart(sales ~ sell_price + state + category_of_product,data = train_data,method = "anova",control = rpart.control(cp = .001))
#Summary of the tree#
printcp(tree_model) # Complexity parameter table
##
## Regression tree:
## rpart(formula = sales ~ sell_price + state + category_of_product,
## data = train_data, method = "anova", control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] category_of_product sell_price state
##
## Root node error: 4073665/316000 = 12.891
##
## n= 316000
##
## CP nsplit rel error xerror xstd
## 1 0.1432103 0 1.00000 1.00000 0.040674
## 2 0.0317094 1 0.85679 0.85690 0.036391
## 3 0.0275738 2 0.82508 0.82520 0.036309
## 4 0.0078410 4 0.76993 0.77008 0.036237
## 5 0.0059825 5 0.76209 0.76243 0.035790
## 6 0.0032659 6 0.75611 0.75647 0.035783
## 7 0.0030049 8 0.74958 0.74994 0.035782
## 8 0.0021862 10 0.74357 0.74394 0.035753
## 9 0.0011438 13 0.73701 0.73739 0.035758
## 10 0.0010783 17 0.73243 0.73282 0.035757
## 11 0.0010243 18 0.73136 0.73175 0.035757
## 12 0.0010000 19 0.73033 0.73084 0.035756
summary(tree_model) # Summary of splits
## Call:
## rpart(formula = sales ~ sell_price + state + category_of_product,
## data = train_data, method = "anova", control = rpart.control(cp = 0.001))
## n= 316000
##
## CP nsplit rel error xerror xstd
## 1 0.143210310 0 1.0000000 1.0000023 0.04067354
## 2 0.031709378 1 0.8567897 0.8569023 0.03639112
## 3 0.027573848 2 0.8250803 0.8252030 0.03630897
## 4 0.007841024 4 0.7699326 0.7700788 0.03623745
## 5 0.005982468 5 0.7620916 0.7624304 0.03578976
## 6 0.003265948 6 0.7561091 0.7564697 0.03578325
## 7 0.003004906 8 0.7495772 0.7499387 0.03578165
## 8 0.002186154 10 0.7435674 0.7439407 0.03575300
## 9 0.001143797 13 0.7370090 0.7373926 0.03575785
## 10 0.001078302 17 0.7324338 0.7328249 0.03575724
## 11 0.001024269 18 0.7313555 0.7317471 0.03575675
## 12 0.001000000 19 0.7303312 0.7308389 0.03575648
##
## Variable importance
## sell_price category_of_product state
## 77 19 4
##
## Node number 1: 316000 observations, complexity param=0.1432103
## mean=1.229525, MSE=12.89134
## left son=2 (309311 obs) right son=3 (6689 obs)
## Primary splits:
## sell_price < 0.945 to the right, improve=0.1432103000, (0 missing)
## category_of_product splits as RLL, improve=0.0492470000, (0 missing)
## state splits as LRL, improve=0.0008209971, (0 missing)
##
## Node number 2: 309311 observations, complexity param=0.03170938
## mean=1.029714, MSE=5.73375
## left son=4 (189174 obs) right son=5 (120137 obs)
## Primary splits:
## sell_price < 3.765 to the right, improve=0.07283479, (0 missing)
## category_of_product splits as RLL, improve=0.04995551, (0 missing)
## state splits as LRL, improve=0.00342148, (0 missing)
## Surrogate splits:
## category_of_product splits as RLR, agree=0.774, adj=0.419, (0 split)
##
## Node number 3: 6689 observations, complexity param=0.007841024
## mean=10.46913, MSE=256.6545
## left son=6 (4823 obs) right son=7 (1866 obs)
## Primary splits:
## state splits as RLL, improve=0.0186057800, (0 missing)
## sell_price < 0.56 to the left, improve=0.0003871141, (0 missing)
##
## Node number 4: 189174 observations, complexity param=0.003265948
## mean=0.5147272, MSE=1.277239
## left son=8 (149795 obs) right son=9 (39379 obs)
## Primary splits:
## sell_price < 3.99 to the right, improve=0.049704720, (0 missing)
## category_of_product splits as RLR, improve=0.040092500, (0 missing)
## state splits as LRL, improve=0.006248875, (0 missing)
## Surrogate splits:
## category_of_product splits as RLL, agree=0.816, adj=0.115, (0 split)
##
## Node number 5: 120137 observations, complexity param=0.02757385
## mean=1.84064, MSE=11.67599
## left son=10 (107902 obs) right son=11 (12235 obs)
## Primary splits:
## sell_price < 0.985 to the right, improve=0.041906300, (0 missing)
## category_of_product splits as RRL, improve=0.019664990, (0 missing)
## state splits as LRL, improve=0.003971486, (0 missing)
##
## Node number 6: 4823 observations
## mean=9.10989, MSE=176.449
##
## Node number 7: 1866 observations
## mean=13.98232, MSE=446.8416
##
## Node number 8: 149795 observations, complexity param=0.001143797
## mean=0.3855402, MSE=0.8440621
## left son=16 (137160 obs) right son=17 (12635 obs)
## Primary splits:
## category_of_product splits as LLR, improve=0.025277250, (0 missing)
## sell_price < 4.925 to the right, improve=0.013113790, (0 missing)
## state splits as LRL, improve=0.007706907, (0 missing)
##
## Node number 9: 39379 observations, complexity param=0.003265948
## mean=1.006145, MSE=2.620039
## left son=18 (30817 obs) right son=19 (8562 obs)
## Primary splits:
## sell_price < 3.975 to the left, improve=0.14149880, (0 missing)
## category_of_product splits as RL-, improve=0.04887079, (0 missing)
## state splits as LRL, improve=0.00462707, (0 missing)
## Surrogate splits:
## category_of_product splits as RL-, agree=0.84, adj=0.264, (0 split)
##
## Node number 10: 107902 observations, complexity param=0.003004906
## mean=1.605095, MSE=8.616501
## left son=20 (44456 obs) right son=21 (63446 obs)
## Primary splits:
## sell_price < 2.375 to the left, improve=0.012628710, (0 missing)
## category_of_product splits as RRL, improve=0.008293683, (0 missing)
## state splits as LRR, improve=0.003148500, (0 missing)
## Surrogate splits:
## category_of_product splits as LRR, agree=0.765, adj=0.429, (0 split)
##
## Node number 11: 12235 observations, complexity param=0.02757385
## mean=3.91794, MSE=33.8535
## left son=22 (6316 obs) right son=23 (5919 obs)
## Primary splits:
## category_of_product splits as R-L, improve=0.40046220, (0 missing)
## sell_price < 0.975 to the left, improve=0.40046220, (0 missing)
## state splits as RRL, improve=0.03797706, (0 missing)
## Surrogate splits:
## sell_price < 0.975 to the left, agree=1.000, adj=1.000, (0 split)
## state splits as RRL, agree=0.518, adj=0.004, (0 split)
##
## Node number 16: 137160 observations, complexity param=0.001143797
## mean=0.3412073, MSE=0.6961176
## left son=32 (114476 obs) right son=33 (22684 obs)
## Primary splits:
## sell_price < 4.925 to the right, improve=0.0245011400, (0 missing)
## state splits as LRL, improve=0.0120270700, (0 missing)
## category_of_product splits as RL-, improve=0.0001249388, (0 missing)
##
## Node number 17: 12635 observations, complexity param=0.001143797
## mean=0.8667986, MSE=2.197137
## left son=34 (6338 obs) right son=35 (6297 obs)
## Primary splits:
## sell_price < 6.235 to the left, improve=0.21656360, (0 missing)
## state splits as RLL, improve=0.03067916, (0 missing)
## Surrogate splits:
## state splits as LLR, agree=0.502, adj=0, (0 split)
##
## Node number 18: 30817 observations
## mean=0.6852062, MSE=1.318207
##
## Node number 19: 8562 observations
## mean=2.161294, MSE=5.60059
##
## Node number 20: 44456 observations, complexity param=0.001078302
## mean=1.211018, MSE=4.670583
## left son=40 (17006 obs) right son=41 (27450 obs)
## Primary splits:
## sell_price < 2.04 to the right, improve=0.02115554000, (0 missing)
## state splits as LRL, improve=0.00914348500, (0 missing)
## category_of_product splits as RRL, improve=0.00003805224, (0 missing)
##
## Node number 21: 63446 observations, complexity param=0.003004906
## mean=1.881222, MSE=11.19631
## left son=42 (6353 obs) right son=43 (57093 obs)
## Primary splits:
## category_of_product splits as RRL, improve=0.017935340, (0 missing)
## sell_price < 2.73 to the right, improve=0.015468030, (0 missing)
## state splits as LRR, improve=0.003656488, (0 missing)
##
## Node number 22: 6316 observations
## mean=0.3535465, MSE=0.7146185
##
## Node number 23: 5919 observations, complexity param=0.005982468
## mean=7.721406, MSE=41.19169
## left son=46 (1475 obs) right son=47 (4444 obs)
## Primary splits:
## state splits as RRL, improve=0.09995573, (0 missing)
##
## Node number 32: 114476 observations
## mean=0.2830724, MSE=0.4648489
##
## Node number 33: 22684 observations, complexity param=0.001143797
## mean=0.6345883, MSE=1.7601
## left son=66 (16395 obs) right son=67 (6289 obs)
## Primary splits:
## sell_price < 4.825 to the left, improve=0.177590200, (0 missing)
## state splits as LRR, improve=0.009905091, (0 missing)
## category_of_product splits as LR-, improve=0.009812318, (0 missing)
##
## Node number 34: 6338 observations
## mean=0.1792364, MSE=0.2449333
##
## Node number 35: 6297 observations
## mean=1.558838, MSE=3.207313
##
## Node number 40: 17006 observations, complexity param=0.001024269
## mean=0.8116547, MSE=2.976698
## left son=80 (6320 obs) right son=81 (10686 obs)
## Primary splits:
## sell_price < 2.13 to the left, improve=0.08242560000, (0 missing)
## state splits as LRL, improve=0.00798616100, (0 missing)
## category_of_product splits as RL-, improve=0.00007882467, (0 missing)
##
## Node number 41: 27450 observations
## mean=1.458434, MSE=5.559966
##
## Node number 42: 6353 observations
## mean=0.5378561, MSE=1.076837
##
## Node number 43: 57093 observations, complexity param=0.002186154
## mean=2.030704, MSE=12.09919
## left son=86 (38217 obs) right son=87 (18876 obs)
## Primary splits:
## sell_price < 2.73 to the right, improve=0.009825145, (0 missing)
## state splits as LRR, improve=0.003535941, (0 missing)
## category_of_product splits as RL-, improve=0.002493100, (0 missing)
##
## Node number 46: 1475 observations
## mean=4.199322, MSE=20.10671
##
## Node number 47: 4444 observations
## mean=8.890414, MSE=42.70604
##
## Node number 66: 16395 observations
## mean=0.2883196, MSE=0.5197995
##
## Node number 67: 6289 observations
## mean=1.537287, MSE=3.866037
##
## Node number 80: 6320 observations
## mean=0.1675633, MSE=0.1989795
##
## Node number 81: 10686 observations
## mean=1.192588, MSE=4.229052
##
## Node number 86: 38217 observations, complexity param=0.002186154
## mean=1.788393, MSE=12.53635
## left son=172 (29054 obs) right son=173 (9163 obs)
## Primary splits:
## sell_price < 3.275 to the left, improve=0.01781566000, (0 missing)
## state splits as LRR, improve=0.00289438400, (0 missing)
## category_of_product splits as LR-, improve=0.00006016325, (0 missing)
##
## Node number 87: 18876 observations
## mean=2.521297, MSE=10.85455
##
## Node number 172: 29054 observations, complexity param=0.002186154
## mean=1.522992, MSE=11.81917
## left son=344 (6564 obs) right son=345 (22490 obs)
## Primary splits:
## sell_price < 2.99 to the right, improve=0.0331818600, (0 missing)
## category_of_product splits as RL-, improve=0.0033850990, (0 missing)
## state splits as LRR, improve=0.0008428921, (0 missing)
##
## Node number 173: 9163 observations
## mean=2.629925, MSE=13.87887
##
## Node number 344: 6564 observations
## mean=0.3638026, MSE=0.9358988
##
## Node number 345: 22490 observations
## mean=1.861316, MSE=14.48895
#Plot the initial tree#
rpart.plot(tree_model, type = 3, extra = 101, fallen.leaves = TRUE,
main = "Initial Regression Tree")
#Perform cross-validation and choose the optimal complexity parameter
(cp)# #The optimal cp minimizes the cross-validated relative error#
optimal_cp <-
tree_model$cptable[which.min(tree_model$cptable[,"xerror"]),"CP"]
cat("Optimal CP:", optimal_cp, "\n")
## Optimal CP: 0.001
#Pruning is Redundant over here as the initial tree closely matches the structure of the pruned tree#
#Performance of the initial tree
cat("Initial Tree Size:", nrow(tree_model$frame), "\n")
## Initial Tree Size: 39
#Variable Importance#
tree_model$variable.importance
## sell_price category_of_product state
## 1026291.44 246158.99 57041.83
#Use the test data to evaluate performance of pruned regression tree#
pred.tree = predict(tree_model, test_data)
#Calcualte the RMSE for the pruned tree#
reg_tree_rmse <- sqrt(mean((pred.tree - test_data$sales)^2))
reg_tree_R2 <- R2(pred.tree, test_data$sales)
cat("Regression Tree Test RMSE:", reg_tree_rmse, "\n")
## Regression Tree Test RMSE: 3.100372
cat("Regression Tree Test R2:", reg_tree_R2, "\n")
## Regression Tree Test R2: 0.2642599
#Defining the grid of hyperparameters to tune (only mtry)#
tune_grid <- expand.grid(
mtry = c( 2, 3) # Test different values for mtry
)
#Training the Random Forest model with cross-validation#
rf_tune_model <- train(
sales ~ sell_price + state + category_of_product,
data = train_data,
method = "rf",
tuneGrid = tune_grid,
ntree = 100, # Specify ntree here
trControl = trainControl(method = "cv", number = 5) # 5-fold cross-validation
)
#View the best model parameters#
print(rf_tune_model)
## Random Forest
##
## 316000 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 252801, 252800, 252799, 252800, 252800
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 3.111332 0.2599662 1.329987
## 3 3.055998 0.2760216 1.252296
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#Training the final model with the best mtry value (mtry = 3)#
rf_final_model <- randomForest(
sales ~ sell_price + state + category_of_product,
data = train_data,
mtry = 3, # Use the best mtry value
ntree = 100 # Number of trees in the forest
)
#Prediction on the test dataset#
rf_final_preds <- predict(rf_final_model, newdata = test_data)
#Calculating RMSE to evaluate performance on the test data#
rf_final_rmse <- sqrt(mean((rf_final_preds - test_data$sales)^2))
rf_final_R2 <- R2(rf_final_preds, test_data$sales)
cat("Final Random Forest RMSE:", rf_final_rmse, "\n")
## Final Random Forest RMSE: 3.083294
cat("Regression Tree Test R2:", rf_final_R2, "\n")
## Regression Tree Test R2: 0.2723996
#Feature importance from the trained Random Forest model#
importance(rf_final_model)
## IncNodePurity
## sell_price 951317.68
## state 76029.61
## category_of_product 121007.41
#Visualizing feature importance#
varImpPlot(rf_final_model)
#Preparing the data matrix (X) and response variable (Y) from pre-split data#
x_train <- model.matrix(sales ~ sell_price + state + category_of_product, train_data)[, -1]
y_train <- train_data$sales
x_test <- model.matrix(sales ~ sell_price + state + category_of_product, test_data)[, -1]
y_test <- test_data$sales
#Defining a grid of lambda values to test#
grid <- 10^seq(10, -2, length = 100)
#Defining and training the Ridge regression model#
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = grid)
summary(ridge_model)
## Length Class Mode
## a0 100 -none- numeric
## beta 500 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
#Cross-validation to find the best lambda#
cv_out <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda <- cv_out$lambda.min
best_lambda
## [1] 0.07121907
#Predictions using the best lambda#
ridge_pred_test <- predict(ridge_model, s = best_lambda, newx = x_test)
#Refit the model with the best lambda on the full data set#
updated_ridge_model <- glmnet(x_train, y_train, alpha = 0)
predict(ridge_model, type = "coefficients", s = best_lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.59094812
## sell_price -0.11931694
## stateMaharashtra 0.16565213
## stateTelangana -0.08095248
## category_of_productfast_moving_consumer_goods -1.19255724
## category_of_productothers -1.45124405
#Calculate RMSE for the test dataset#
ridge_test_rmse <- sqrt(mean((ridge_pred_test - y_test)^2))
#Calculate R-squared for the test dataset#
ridge_test_rsquared <- 1 - sum((ridge_pred_test - y_test)^2) / sum((y_test - mean(y_test))^2)
#Print the RMSE and R-squared values for the test dataset#
cat("Ridge Regression Test RMSE:", ridge_test_rmse, "\n")
## Ridge Regression Test RMSE: 3.500085
cat("Ridge Regression Test R-squared:", ridge_test_rsquared, "\n")
## Ridge Regression Test R-squared: 0.06227416
#Convert categorical variables to one-hot encoding#
train_data_encoded <- model.matrix(sales ~ . - 1, data = train_data)
test_data_encoded <- model.matrix(sales ~ . - 1, data = test_data)
#Creating DMatrix for XGBoost#
train_matrix <- xgb.DMatrix(data = as.matrix(train_data_encoded), label = train_data$sales)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data_encoded), label = test_data$sales)
#Hyperparameter Tuning Grid#
#Setting XGBoost parameters#
params <- list(
objective = "reg:squarederror", # regression task
eval_metric = "rmse", # evaluation metric
booster = "gbtree", # tree-based boosting
eta = 0.1, # learning rate
max_depth = 4, # maximum depth of trees
colsample_bytree = 0.6, # feature subsampling
subsample = 0.7, # row subsampling
lambda = 1 # L2 regularization
)
xgb_model <- xgb.train(params = params, data = train_matrix, nrounds = 50, verbose = 0)
#Predict on the test set#
xgb_preds <- predict(xgb_model, test_matrix)
#Evaluate the model#
xgb_rmse <- sqrt(mean((xgb_preds - test_data$sales)^2))
xgb_r2 <- cor(xgb_preds, test_data$sales)^2
cat("xgb_RMSE :", xgb_rmse, "\n", "xgb_r2 :", xgb_r2, "\n")
## xgb_RMSE : 2.84988
## xgb_r2 : 0.3851964
r_squared_data <- data.frame(
Model = c("Multiple Linear Regression", "Decision Tree model", "Random Forest model", "Ridge Regression", "XGBoost Model"),
R_squared = c(test_R2,
reg_tree_R2, rf_final_R2, ridge_test_rsquared, xgb_r2)
)
r_squared_data
## Model R_squared
## 1 Multiple Linear Regression 0.06228829
## 2 Decision Tree model 0.26425988
## 3 Random Forest model 0.27239959
## 4 Ridge Regression 0.06227416
## 5 XGBoost Model 0.38519638
rmse_data <- data.frame(
Model = c("Multiple Linear Regression", "Decision Tree model", "Random Forest model",
"Ridge Regression", "XGBoost Model"),
RMSE = c(test_rmse, reg_tree_rmse, rf_final_rmse, ridge_test_rmse, xgb_rmse)
)
rmse_data
## Model RMSE
## 1 Multiple Linear Regression 3.500064
## 2 Decision Tree model 3.100372
## 3 Random Forest model 3.083294
## 4 Ridge Regression 3.500085
## 5 XGBoost Model 2.849880
#Viewing the combined data#
combined_data <- merge(r_squared_data, rmse_data, by = "Model")
print(combined_data)
## Model R_squared RMSE
## 1 Decision Tree model 0.26425988 3.100372
## 2 Multiple Linear Regression 0.06228829 3.500064
## 3 Random Forest model 0.27239959 3.083294
## 4 Ridge Regression 0.06227416 3.500085
## 5 XGBoost Model 0.38519638 2.849880
actual_values <- test_data$sales
Creating a dataframe with actual and predicted values
plot_data <- data.frame(
Actual = actual_values,
Predicted = xgb_preds
)
#Plot Actual vs Predicted#
ggplot(plot_data, aes(x = Actual, y = Predicted)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
theme_minimal() +
ggtitle("Actual vs Predicted (XGBoost Model)") +
xlab("Actual Values") +
ylab("Predicted Values")
## `geom_smooth()` using formula = 'y ~ x'
##Key Observations from the Plot——–##
Clustering Near Lower Values: There is a high density of points near lower actual and predicted values (closer to zero). This suggests the model performs relatively well for lower sales or smaller target values.
Deviation for Higher Values: For higher actual values (above ~50), many points deviate significantly from the blue line. This indicates that the model struggles to predict higher values accurately, leading to underestimation or overestimation.
Spread of Errors: The spread of points around the blue line reflects residuals (errors in prediction). A wide spread indicates higher error variance.
#Calculate residuals#
residuals <- actual_values - xgb_preds
#Create a residual plot#
ggplot(data.frame(Residuals = residuals), aes(x = Residuals)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black", alpha = 0.7) +
theme_minimal() +
ggtitle("Residual Plot (XGBoost Model)") +
xlab("Residuals")
##Interpretation of Model Performance——–##
Strengths: The high concentration of residuals around 0 suggests the model performs well for most of the data points. It captures the general trends in the data effectively.
Weaknesses: Outliers or high residuals (positive side) indicate the model struggles to predict higher actual values accurately.
This could be due to: A lack of relevant features. Insufficient complexity in the model for these cases. Skewed or imbalanced target data distribution.