library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
orders <- read.csv("C:/Users/HARDEEP DESHWAL/Downloads/archive (3)/List of Orders.csv")
details <- read.csv("C:/Users/HARDEEP DESHWAL/Downloads/archive (3)/Order Details.csv")
sales_target <- read.csv("C:/Users/HARDEEP DESHWAL/Downloads/archive (3)/Sales target.csv")
View(orders)
View(details)
str(orders)
## 'data.frame': 560 obs. of 5 variables:
## $ Order.ID : chr "B-25601" "B-25602" "B-25603" "B-25604" ...
## $ Order.Date : chr "01-04-2018" "01-04-2018" "03-04-2018" "03-04-2018" ...
## $ CustomerName: chr "Bharat" "Pearl" "Jahan" "Divsha" ...
## $ State : chr "Gujarat" "Maharashtra" "Madhya Pradesh" "Rajasthan" ...
## $ City : chr "Ahmedabad" "Pune" "Bhopal" "Jaipur" ...
str(details)
## 'data.frame': 1500 obs. of 6 variables:
## $ Order.ID : chr "B-25601" "B-25601" "B-25601" "B-25601" ...
## $ Amount : num 1275 66 8 80 168 ...
## $ Profit : num -1148 -12 -2 -56 -111 ...
## $ Quantity : int 7 5 3 4 2 5 4 3 8 5 ...
## $ Category : chr "Furniture" "Clothing" "Clothing" "Electronics" ...
## $ Sub.Category: chr "Bookcases" "Stole" "Hankerchief" "Electronic Games" ...
names(orders)
## [1] "Order.ID" "Order.Date" "CustomerName" "State" "City"
names(details)
## [1] "Order.ID" "Amount" "Profit" "Quantity" "Category"
## [6] "Sub.Category"
head(orders)
## Order.ID Order.Date CustomerName State City
## 1 B-25601 01-04-2018 Bharat Gujarat Ahmedabad
## 2 B-25602 01-04-2018 Pearl Maharashtra Pune
## 3 B-25603 03-04-2018 Jahan Madhya Pradesh Bhopal
## 4 B-25604 03-04-2018 Divsha Rajasthan Jaipur
## 5 B-25605 05-04-2018 Kasheen West Bengal Kolkata
## 6 B-25606 06-04-2018 Hazel Karnataka Bangalore
head(details)
## Order.ID Amount Profit Quantity Category Sub.Category
## 1 B-25601 1275 -1148 7 Furniture Bookcases
## 2 B-25601 66 -12 5 Clothing Stole
## 3 B-25601 8 -2 3 Clothing Hankerchief
## 4 B-25601 80 -56 4 Electronics Electronic Games
## 5 B-25602 168 -111 2 Electronics Phones
## 6 B-25602 424 -272 5 Electronics Phones
tail(orders)
## Order.ID Order.Date CustomerName State City
## 555
## 556
## 557
## 558
## 559
## 560
tail(details)
## Order.ID Amount Profit Quantity Category Sub.Category
## 1495 B-26099 207 37 4 Clothing Hankerchief
## 1496 B-26099 835 267 5 Electronics Phones
## 1497 B-26099 2366 552 5 Clothing Trousers
## 1498 B-26100 828 230 2 Furniture Chairs
## 1499 B-26100 34 10 2 Clothing T-shirt
## 1500 B-26100 72 16 2 Clothing Shirt
summary(orders)
## Order.ID Order.Date CustomerName State
## Length:560 Length:560 Length:560 Length:560
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## City
## Length:560
## Class :character
## Mode :character
summary(details)
## Order.ID Amount Profit Quantity
## Length:1500 Min. : 4.0 Min. :-1981.00 Min. : 1.000
## Class :character 1st Qu.: 45.0 1st Qu.: -9.25 1st Qu.: 2.000
## Mode :character Median : 118.0 Median : 9.00 Median : 3.000
## Mean : 287.7 Mean : 15.97 Mean : 3.743
## 3rd Qu.: 322.0 3rd Qu.: 38.00 3rd Qu.: 5.000
## Max. :5729.0 Max. : 1698.00 Max. :14.000
## Category Sub.Category
## Length:1500 Length:1500
## Class :character Class :character
## Mode :character Mode :character
##
##
##
dim(orders)
## [1] 560 5
dim(details)
## [1] 1500 6
colSums(is.na(orders))
## Order.ID Order.Date CustomerName State City
## 0 0 0 0 0
colSums(is.na(details))
## Order.ID Amount Profit Quantity Category Sub.Category
## 0 0 0 0 0 0
ecommerce <- merge(orders, details, by = "Order.ID")
View(ecommerce)
# Structure of merged data
str(ecommerce)
## 'data.frame': 1500 obs. of 10 variables:
## $ Order.ID : chr "B-25601" "B-25601" "B-25601" "B-25601" ...
## $ Order.Date : chr "01-04-2018" "01-04-2018" "01-04-2018" "01-04-2018" ...
## $ CustomerName: chr "Bharat" "Bharat" "Bharat" "Bharat" ...
## $ State : chr "Gujarat" "Gujarat" "Gujarat" "Gujarat" ...
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Amount : num 1275 66 8 80 168 ...
## $ Profit : num -1148 -12 -2 -56 -111 ...
## $ Quantity : int 7 5 3 4 2 5 4 3 8 5 ...
## $ Category : chr "Furniture" "Clothing" "Clothing" "Electronics" ...
## $ Sub.Category: chr "Bookcases" "Stole" "Hankerchief" "Electronic Games" ...
ecommerce$Amount <- as.numeric(ecommerce$Amount)
ecommerce$Quantity <- as.numeric(ecommerce$Quantity)
ecommerce$Profit <- as.numeric(ecommerce$Profit)
high_sales <- ecommerce %>%
filter(Amount > 1000) %>%
select(Order.ID, Amount)
high_sales
## Order.ID Amount
## 1 B-25601 1275
## 2 B-25602 2617
## 3 B-25603 1355
## 4 B-25608 1364
## 5 B-25610 1076
## 6 B-25613 1603
## 7 B-25626 1103
## 8 B-25629 1560
## 9 B-25639 1629
## 10 B-25643 1061
## 11 B-25653 1279
## 12 B-25653 1327
## 13 B-25656 1389
## 14 B-25657 1021
## 15 B-25667 1030
## 16 B-25676 1263
## 17 B-25681 1625
## 18 B-25681 1096
## 19 B-25686 1829
## 20 B-25697 1300
## 21 B-25728 1055
## 22 B-25729 1549
## 23 B-25730 1145
## 24 B-25738 1069
## 25 B-25749 1052
## 26 B-25752 1361
## 27 B-25755 1709
## 28 B-25757 3151
## 29 B-25761 2188
## 30 B-25762 1316
## 31 B-25768 1582
## 32 B-25772 1183
## 33 B-25777 1076
## 34 B-25778 1506
## 35 B-25779 1361
## 36 B-25786 1854
## 37 B-25793 1402
## 38 B-25797 1630
## 39 B-25798 2830
## 40 B-25810 1120
## 41 B-25823 2103
## 42 B-25830 1063
## 43 B-25830 1954
## 44 B-25836 1298
## 45 B-25839 1250
## 46 B-25842 1543
## 47 B-25853 2093
## 48 B-25855 1027
## 49 B-25855 1319
## 50 B-25858 2457
## 51 B-25862 2061
## 52 B-25868 1118
## 53 B-25873 1275
## 54 B-25877 1137
## 55 B-25881 1351
## 56 B-25881 2244
## 57 B-25881 2115
## 58 B-25887 2125
## 59 B-25894 1246
## 60 B-25902 1700
## 61 B-25909 1622
## 62 B-25910 1622
## 63 B-25919 1599
## 64 B-25923 3873
## 65 B-25925 1228
## 66 B-25929 1308
## 67 B-25935 1657
## 68 B-25937 1101
## 69 B-25943 1547
## 70 B-25950 1622
## 71 B-25953 1218
## 72 B-25955 1716
## 73 B-25955 2927
## 74 B-25957 1157
## 75 B-25964 1270
## 76 B-25969 2452
## 77 B-25973 4141
## 78 B-25978 1063
## 79 B-25993 4363
## 80 B-25995 1314
## 81 B-25997 2292
## 82 B-26003 1745
## 83 B-26006 1301
## 84 B-26022 1824
## 85 B-26023 1117
## 86 B-26028 1272
## 87 B-26048 1461
## 88 B-26048 1104
## 89 B-26051 1337
## 90 B-26055 5729
## 91 B-26055 1218
## 92 B-26067 1120
## 93 B-26067 1137
## 94 B-26073 1514
## 95 B-26085 1487
## 96 B-26093 2847
## 97 B-26099 2366
head(high_sales)
## Order.ID Amount
## 1 B-25601 1275
## 2 B-25602 2617
## 3 B-25603 1355
## 4 B-25608 1364
## 5 B-25610 1076
## 6 B-25613 1603
top_orders <- ecommerce %>%
arrange(desc(Amount)) %>%
select(Order.ID, Amount) %>%
head(10)
top_orders
## Order.ID Amount
## 1 B-26055 5729
## 2 B-25993 4363
## 3 B-25973 4141
## 4 B-25923 3873
## 5 B-25757 3151
## 6 B-25955 2927
## 7 B-26093 2847
## 8 B-25798 2830
## 9 B-25602 2617
## 10 B-25858 2457
rank_orders <- ecommerce %>%
arrange(desc(Amount)) %>%
mutate(rank = row_number())
head(rank_orders)
## Order.ID Order.Date CustomerName State City Amount Profit
## 1 B-26055 10-03-2019 Yaanvi Madhya Pradesh Indore 5729 64
## 2 B-25993 03-02-2019 Harshal Delhi Delhi 4363 305
## 3 B-25973 24-01-2019 Seema Uttar Pradesh Allahabad 4141 1698
## 4 B-25923 27-12-2018 Vishakha Maharashtra Mumbai 3873 891
## 5 B-25757 21-08-2018 Mohit Madhya Pradesh Indore 3151 -35
## 6 B-25955 16-01-2019 Soumya Maharashtra Pune 2927 146
## Quantity Category Sub.Category rank
## 1 14 Furniture Chairs 1
## 2 5 Furniture Tables 2
## 3 13 Electronics Printers 3
## 4 6 Electronics Phones 4
## 5 7 Clothing Trousers 5
## 6 8 Furniture Bookcases 6
topper <- rank_orders %>%
filter(rank == 1) %>%
select(Order.ID, Amount, rank)
topper
## Order.ID Amount rank
## 1 B-26055 5729 1
avg_top10 <- top_orders %>%
summarise(avg_sales = mean(Amount, na.rm = TRUE))
avg_top10
## avg_sales
## 1 3493.5
high_profit <- ecommerce %>%
arrange(desc(Profit)) %>%
select(Order.ID, Profit, Quantity)
head(high_profit, 10)
## Order.ID Profit Quantity
## 1 B-25973 1698 13
## 2 B-25602 1151 4
## 3 B-25761 1050 5
## 4 B-25923 891 6
## 5 B-25830 782 3
## 6 B-26073 742 4
## 7 B-25853 721 5
## 8 B-26093 712 8
## 9 B-25862 701 5
## 10 B-25656 680 7
low_profit <- ecommerce %>%
arrange(Profit) %>%
select(Order.ID, Profit, Quantity)
head(low_profit, 10)
## Order.ID Profit Quantity
## 1 B-25798 -1981 13
## 2 B-25608 -1864 5
## 3 B-26022 -1303 8
## 4 B-25601 -1148 7
## 5 B-25779 -980 3
## 6 B-25666 -916 7
## 7 B-25797 -802 5
## 8 B-25730 -706 3
## 9 B-25681 -658 7
## 10 B-25653 -640 8
ggplot(ecommerce, aes(x = Amount)) +
geom_histogram(
binwidth = 500,
fill = "red",
color = "black"
) +
labs(
title = "Distribution of Sales Amount",
x = "Amount",
y = "Count"
)

ecommerce$sales_group <- ifelse(
ecommerce$Amount >= 1000,
"High",
ifelse(
ecommerce$Amount >= 500,
"Medium",
"Low"
)
)
ggplot(ecommerce,
aes(x = sales_group,
fill = sales_group)) +
geom_bar() +
labs(
title = "Sales Categories",
x = "Category",
y = "Count"
)

ggplot(ecommerce,
aes(x = Quantity,
y = Amount)) +
geom_point(
color = "blue",
na.rm = TRUE
) +
labs(
title = "Quantity vs Amount",
x = "Quantity",
y = "Amount"
)

ggplot(ecommerce,
aes(x = Profit,
y = Amount)) +
geom_point(
color = "darkblue",
na.rm = TRUE
) +
labs(
title = "Profit vs Amount",
x = "Profit",
y = "Amount"
)

ggplot(ecommerce,
aes(x = Quantity)) +
geom_histogram(
binwidth = 1,
fill = "green",
color = "black"
) +
labs(
title = "Quantity Distribution",
x = "Quantity",
y = "Count"
)

ggplot(top_orders,
aes(x = reorder(Order.ID, Amount),
y = Amount)) +
geom_bar(
stat = "identity",
fill = "purple"
) +
coord_flip() +
labs(
title = "Top 10 Orders",
x = "Order ID",
y = "Amount"
)

category_sales <- ecommerce %>%
group_by(Category) %>%
summarise(total_sales = sum(Amount, na.rm = TRUE))
category_sales
## # A tibble: 3 × 2
## Category total_sales
## <chr> <dbl>
## 1 Clothing 139054
## 2 Electronics 165267
## 3 Furniture 127181
ggplot(category_sales,
aes(x = Category,
y = total_sales,
fill = Category)) +
geom_bar(stat = "identity") +
labs(
title = "Category vs Sales",
x = "Category",
y = "Total Sales"
)

state_sales <- ecommerce %>%
group_by(State) %>%
summarise(total_sales = sum(Amount, na.rm = TRUE))
state_sales
## # A tibble: 19 × 2
## State total_sales
## <chr> <dbl>
## 1 "Andhra Pradesh" 13256
## 2 "Bihar" 12943
## 3 "Delhi" 22531
## 4 "Goa" 6705
## 5 "Gujarat" 21058
## 6 "Haryana" 8863
## 7 "Himachal Pradesh" 8666
## 8 "Jammu and Kashmir" 10829
## 9 "Karnataka" 15058
## 10 "Kerala " 13459
## 11 "Madhya Pradesh" 105140
## 12 "Maharashtra" 95348
## 13 "Nagaland" 11903
## 14 "Punjab" 16786
## 15 "Rajasthan" 21149
## 16 "Sikkim" 5276
## 17 "Tamil Nadu" 6087
## 18 "Uttar Pradesh" 22359
## 19 "West Bengal" 14086
ggplot(state_sales,
aes(x = reorder(State, total_sales),
y = total_sales)) +
geom_bar(
stat = "identity",
fill = "orange"
) +
coord_flip() +
labs(
title = "State vs Sales",
x = "State",
y = "Total Sales"
)

plot(
ecdf(ecommerce$Amount),
main = "CDF of Sales Amount",
xlab = "Amount",
ylab = "Cumulative Probability",
col = "blue"
)

plot(
ecdf(ecommerce$Profit),
main = "CDF of Profit",
xlab = "Profit",
ylab = "Cumulative Probability",
col = "darkgreen"
)

ggplot(ecommerce,
aes(x = Category,
y = Amount,
fill = Category)) +
geom_boxplot() +
labs(
title = "Box Plot of Amount by Category",
x = "Category",
y = "Amount"
)

ggplot(ecommerce,
aes(x = Category,
y = Profit,
fill = Category)) +
geom_boxplot() +
labs(
title = "Box Plot of Profit by Category",
x = "Category",
y = "Profit"
)

ggplot(ecommerce,
aes(y = Quantity)) +
geom_boxplot(fill = "skyblue") +
labs(
title = "Box Plot of Quantity",
y = "Quantity"
)

anova_amount <- aov(
Amount ~ Category,
data = ecommerce
)
summary(anova_amount)
## Df Sum Sq Mean Sq F value Pr(>F)
## Category 2 51488804 25744402 144.3 <2e-16 ***
## Residuals 1497 267149957 178457
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_profit <- aov(
Profit ~ Category,
data = ecommerce
)
summary(anova_profit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Category 2 128025 64013 2.241 0.107
## Residuals 1497 42756162 28561
cor(
ecommerce$Amount,
ecommerce$Profit,
use = "complete.obs"
)
## [1] 0.2420446
cor(
ecommerce$Quantity,
ecommerce$Amount,
use = "complete.obs"
)
## [1] 0.3520518
numeric_data <- ecommerce %>%
select(Amount, Profit, Quantity)
cor(numeric_data)
## Amount Profit Quantity
## Amount 1.0000000 0.24204460 0.35205176
## Profit 0.2420446 1.00000000 0.00244136
## Quantity 0.3520518 0.00244136 1.00000000
single_reg <- lm(
Amount ~ Quantity,
data = ecommerce
)
summary(single_reg)
##
## Call:
## lm(formula = Amount ~ Quantity, data = ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -859.0 -201.4 -103.2 57.3 4679.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.586 22.116 0.433 0.665
## Quantity 74.287 5.103 14.558 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 431.7 on 1498 degrees of freedom
## Multiple R-squared: 0.1239, Adjusted R-squared: 0.1234
## F-statistic: 211.9 on 1 and 1498 DF, p-value: < 2.2e-16
ggplot(ecommerce,
aes(x = Quantity,
y = Amount)) +
geom_point(color = "blue") +
geom_smooth(
method = "lm",
se = FALSE,
color = "red"
) +
labs(
title = "Single Regression Plot",
x = "Quantity",
y = "Amount"
)
## `geom_smooth()` using formula = 'y ~ x'

single_reg2 <- lm(
Profit ~ Quantity,
data = ecommerce
)
summary(single_reg2)
##
## Call:
## lm(formula = Profit ~ Quantity, data = ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1998.72 -25.46 -6.92 22.18 1680.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.262 8.668 1.761 0.0785 .
## Quantity 0.189 2.000 0.094 0.9247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 169.2 on 1498 degrees of freedom
## Multiple R-squared: 5.96e-06, Adjusted R-squared: -0.0006616
## F-statistic: 0.008928 on 1 and 1498 DF, p-value: 0.9247
multiple_reg <- lm(
Amount ~ Quantity + Profit,
data = ecommerce
)
summary(multiple_reg)
##
## Call:
## lm(formula = Amount ~ Quantity + Profit, data = ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -837.6 -196.2 -98.7 76.9 4649.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.44862 21.39834 -0.021 0.983
## Quantity 74.16314 4.93226 15.036 <2e-16 ***
## Profit 0.65744 0.06371 10.318 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.2 on 1497 degrees of freedom
## Multiple R-squared: 0.1821, Adjusted R-squared: 0.181
## F-statistic: 166.7 on 2 and 1497 DF, p-value: < 2.2e-16
predicted_values <- predict(multiple_reg)
head(predicted_values)
## 1 2 3 4 5 6
## -236.04371 362.47785 220.72593 259.38750 74.90222 191.54437
ggplot(ecommerce,
aes(x = Profit,
y = Amount)) +
geom_point(color = "darkblue") +
geom_smooth(
method = "lm",
se = FALSE,
color = "red"
) +
labs(
title = "Profit vs Amount Regression",
x = "Profit",
y = "Amount"
)
## `geom_smooth()` using formula = 'y ~ x'

poly_reg <- lm(
Amount ~ poly(Quantity, 2),
data = ecommerce
)
summary(poly_reg)
##
## Call:
## lm(formula = Amount ~ poly(Quantity, 2), data = ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1175.4 -188.3 -114.6 45.9 4340.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 287.67 11.12 25.878 <2e-16 ***
## poly(Quantity, 2)1 6284.28 430.53 14.597 <2e-16 ***
## poly(Quantity, 2)2 1293.71 430.53 3.005 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 430.5 on 1497 degrees of freedom
## Multiple R-squared: 0.1292, Adjusted R-squared: 0.128
## F-statistic: 111 on 2 and 1497 DF, p-value: < 2.2e-16
ggplot(ecommerce,
aes(x = Quantity,
y = Amount)) +
geom_point(color = "purple") +
stat_smooth(
method = "lm",
formula = y ~ poly(x, 2),
se = FALSE,
color = "red"
) +
labs(
title = "Polynomial Regression Plot",
x = "Quantity",
y = "Amount"
)

poly_reg3 <- lm(
Profit ~ poly(Quantity, 3),
data = ecommerce
)
summary(poly_reg3)
##
## Call:
## lm(formula = Profit ~ poly(Quantity, 3), data = ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1984.87 -25.56 -6.81 22.15 1694.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.970 4.371 3.653 0.000268 ***
## poly(Quantity, 3)1 15.987 169.299 0.094 0.924778
## poly(Quantity, 3)2 -3.944 169.299 -0.023 0.981416
## poly(Quantity, 3)3 -72.659 169.299 -0.429 0.667856
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 169.3 on 1496 degrees of freedom
## Multiple R-squared: 0.0001294, Adjusted R-squared: -0.001876
## F-statistic: 0.06455 on 3 and 1496 DF, p-value: 0.9786
plot(
single_reg$fitted.values,
single_reg$residuals,
main = "Residual Plot",
xlab = "Fitted Values",
ylab = "Residuals",
col = "blue"
)
abline(h = 0, col = "red")

actual_predicted <- data.frame(
Actual = ecommerce$Amount,
Predicted = predicted_values
)
ggplot(actual_predicted,
aes(x = Actual,
y = Predicted)) +
geom_point(color = "darkgreen") +
labs(
title = "Actual vs Predicted Values",
x = "Actual Amount",
y = "Predicted Amount"
)

boxplot(
ecommerce$Amount,
main = "Outlier Detection",
col = "pink"
)

qqnorm(ecommerce$Amount)
qqline(
ecommerce$Amount,
col = "red"
)

cov(
ecommerce$Amount,
ecommerce$Profit,
use = "complete.obs"
)
## [1] 18875.2
ggplot(ecommerce,
aes(x = Amount)) +
geom_density(fill = "lightblue") +
labs(
title = "Density Plot of Amount",
x = "Amount",
y = "Density"
)
