library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the clothing-store data set. The file is addressed by its full path
# rather than via setwd(), so the session's working directory is not changed
# as a side effect of running this script.
data_dir <- "C:\\Users\\Dell\\Downloads"
clothing <- read.csv(file.path(data_dir, "Clothing.csv"))
# Print the column names, types and first few values of the data frame.
str(clothing)
## 'data.frame': 400 obs. of 14 variables:
## $ rownames: int 1 2 3 4 5 6 7 8 9 10 ...
## $ tsales : int 750000 1926395 1250000 694227 750000 400000 1300000 495340 1200000 495340 ...
## $ sales : num 4412 4281 4167 2670 15000 ...
## $ margin : num 41 39 40 40 44 41 39 28 41 37 ...
## $ nown : num 1 2 1 1 2 ...
## $ nfull : num 1 2 2 1 1.96 ...
## $ npart : num 1 3 2.22 1.28 1.28 ...
## $ naux : num 1.54 1.54 1.41 1.37 1.37 ...
## $ hoursw : int 76 192 114 100 104 72 161 80 158 87 ...
## $ hourspw : num 16.8 22.5 17.2 21.5 15.7 ...
## $ inv1 : num 17167 17167 292857 22207 22207 ...
## $ inv2 : num 27177 27177 71571 15000 10000 ...
## $ ssize : int 170 450 300 260 50 90 400 100 450 75 ...
## $ start : num 41 39 40 40 44 41 39 28 41 37 ...
# Per-column summary statistics (min, quartiles, mean, max) for every variable.
summary(clothing)
## rownames tsales sales margin
## Min. : 1.0 Min. : 50000 Min. : 300 Min. :16.00
## 1st Qu.:100.8 1st Qu.: 495340 1st Qu.: 3904 1st Qu.:37.00
## Median :200.5 Median : 694227 Median : 5279 Median :39.00
## Mean :200.5 Mean : 833584 Mean : 6335 Mean :38.77
## 3rd Qu.:300.2 3rd Qu.: 976817 3rd Qu.: 7740 3rd Qu.:41.00
## Max. :400.0 Max. :5000000 Max. :27000 Max. :66.00
## nown nfull npart naux
## Min. : 1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.000 1st Qu.:1.923 1st Qu.:1.283 1st Qu.:1.333
## Median : 1.000 Median :1.956 Median :1.283 Median :1.367
## Mean : 1.284 Mean :2.069 Mean :1.566 Mean :1.390
## 3rd Qu.: 1.295 3rd Qu.:2.066 3rd Qu.:2.000 3rd Qu.:1.367
## Max. :10.000 Max. :8.000 Max. :9.000 Max. :4.000
## hoursw hourspw inv1 inv2
## Min. : 32.0 Min. : 5.708 Min. : 1000 Min. : 350
## 1st Qu.: 80.0 1st Qu.:13.541 1st Qu.: 20000 1st Qu.: 10000
## Median :104.0 Median :17.745 Median : 22207 Median : 22860
## Mean :121.1 Mean :18.955 Mean : 58257 Mean : 27829
## 3rd Qu.:145.2 3rd Qu.:24.303 3rd Qu.: 62269 3rd Qu.: 22860
## Max. :582.0 Max. :43.326 Max. :1500000 Max. :400000
## ssize start
## Min. : 16.0 Min. :16.00
## 1st Qu.: 80.0 1st Qu.:37.00
## Median : 120.0 Median :40.00
## Mean : 151.1 Mean :42.81
## 3rd Qu.: 190.0 3rd Qu.:42.00
## Max. :1214.0 Max. :90.00
# Show the first rows of the data frame (head() defaults to six rows).
head(clothing)
library(dplyr): dplyr санг ачаална; энэ сан нь өгөгдөл боловсруулахыг хялбар болгоно. read.csv("Clothing.csv"): Өгөгдлийг CSV файлаас уншина. str(clothing): Өгөгдлийн бүтцийг (багана, өгөгдлийн төрөл гэх мэт) харуулна. summary(clothing): Бүх хувьсагчдын дундаж, медиан, хамгийн бага, хамгийн их утга зэрэг статистикийг харуулна. head(clothing): Өгөгдлийн эхний хэдэн мөрийг харуулна.
# One-row overview: centre and spread of `sales`, plus the average `margin`
# and the average `hoursw`. Missing values are dropped in every statistic.
summarise(
  clothing,
  Mean_Sales = mean(sales, na.rm = TRUE),
  Median_Sales = median(sales, na.rm = TRUE),
  SD_Sales = sd(sales, na.rm = TRUE),
  Mean_Margin = mean(margin, na.rm = TRUE),
  Mean_Hours_Worked = mean(hoursw, na.rm = TRUE)
)
mean(sales, na.rm = TRUE) – Борлуулалтын (нэг м² талбайд ногдох) дундаж утгыг тооцоолно. median(sales, na.rm = TRUE) – Борлуулалтын медианыг олно. sd(sales, na.rm = TRUE) – Борлуулалтын стандарт хазайлтыг тооцоолно (утгууд дунджаасаа хэр өргөн тархсаныг харуулна). mean(margin, na.rm = TRUE) – Нийт ашгийн харьцааны дундаж утгыг тооцоолно. mean(hoursw, na.rm = TRUE) – Нийт ажилласан цагийн дундаж утгыг тооцоолно. na.rm = TRUE нь дутуу (NA) утгуудыг тооцооллоос хасна.
library(ggplot2)
# Bar chart counting the stores at each distinct `nown` value; `nown` is
# converted to a factor so every distinct value gets its own bar.
ggplot(clothing) +
  geom_bar(aes(x = as.factor(nown)), fill = "orange", alpha = 0.7) +
  ggtitle("Number of Owners per Store") +
  xlab("Number of Owners") +
  ylab("Count") +
  theme_minimal()
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.2
# Keep only the two staffing columns and reshape them to long format
# (one row per store and worker type) so the worker type can go on the x-axis.
clothing_workers <- clothing[, c("nfull", "npart")]
clothing_workers_melted <- melt(clothing_workers)
## No id variables; using all as measure variables
# Bar height is the mean of `value` within each worker type. The colours are
# assigned through a named manual scale, so each colour is tied to its column
# by name instead of relying on the number and order of the summarised rows
# (which is what a length-2 constant `fill` on the layer depended on).
ggplot(
  clothing_workers_melted,
  aes(x = variable, y = value, fill = variable)
) +
  geom_bar(stat = "summary", fun = mean, alpha = 0.7) +
  scale_fill_manual(
    values = c(nfull = "palevioletred2", npart = "lightblue"),
    guide = "none" # the x-axis already names the groups
  ) +
  theme_minimal() +
  labs(
    title = "Average Number of Full-Time vs. Part-Time Workers",
    x = "Worker Type",
    y = "Average Count"
  )
# Histogram of annual sales (`tsales`) using 30 bins.
ggplot(clothing) +
  geom_histogram(aes(x = tsales), bins = 30, fill = "orchid4", alpha = 0.7) +
  ggtitle("Distribution of Total Sales") +
  xlab("Total Sales") +
  ylab("Count")
# Box plot of `sales` (sales per square meter).
ggplot(clothing) +
  geom_boxplot(aes(y = sales), fill = "lightblue", alpha = 0.6) +
  ggtitle("Boxplot of Sales per Square Meter") +
  ylab("Sales per Square Meter")
library(ggplot2)
# Kernel density of `sales`, with a rug marking each individual observation.
ggplot(clothing, aes(x = sales)) +
  geom_density(alpha = 0.5, fill = "pink") +
  geom_rug(alpha = 0.8, color = "purple") +
  ggtitle("Sales Distribution") +
  xlab("Sales") +
  ylab("Density") +
  theme_minimal()
# Box plot of the profit margin.
ggplot(clothing) +
  geom_boxplot(aes(y = margin), color = "darkblue", fill = "lightblue") +
  ggtitle("Profit Margin Boxplot") +
  ylab("Margin") +
  theme_minimal()
# Scatter plot of `sales` against store size, overlaid with a linear
# (method = "lm") trend line.
ggplot(clothing, aes(x = ssize, y = sales)) +
  geom_point(color = "purple", alpha = 0.5) +
  geom_smooth(color = "red", method = "lm") +
  ggtitle("Sales vs. Store Size") +
  xlab("Store Size (m²)") +
  ylab("Sales per m²") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Keep only the two investment columns and reshape them to long format so the
# investment type can go on the x-axis.
clothing_investments <- clothing[, c("inv1", "inv2")]
clothing_investments_melted <- melt(clothing_investments)
## No id variables; using all as measure variables
# Bar height is the mean of `value` per investment type. Colours are bound to
# the column names through a manual scale rather than passed as a length-2
# constant, which only worked because the summarised data has two rows.
ggplot(
  clothing_investments_melted,
  aes(x = variable, y = value, fill = variable)
) +
  geom_bar(stat = "summary", fun = mean, alpha = 0.7) +
  scale_fill_manual(
    values = c(inv1 = "green", inv2 = "blue"),
    guide = "none" # the x-axis already names the groups
  ) +
  theme_minimal() +
  labs(
    title = "Average Investments: Shop vs. Automation",
    x = "Investment Type",
    y = "Average Amount"
  )
# Histogram of `start` in bins 5 units wide, with white bin outlines.
ggplot(clothing) +
  geom_histogram(
    aes(x = start),
    binwidth = 5, color = "white", fill = "darkgreen", alpha = 0.7
  ) +
  ggtitle("Business Start Year Distribution") +
  xlab("Year") +
  ylab("Number of Stores") +
  theme_minimal()
# Label each store by which worker type is in the majority. Stores where the
# two counts are equal get their own "Equal" label; previously such ties fell
# into the else-branch and were reported as "More Full-Timers".
clothing$more_part_timers <- ifelse(
  clothing$npart > clothing$nfull,
  "More Part-Timers",
  ifelse(clothing$npart < clothing$nfull, "More Full-Timers", "Equal")
)
# Count of stores in each majority group.
ggplot(clothing, aes(x = more_part_timers, fill = more_part_timers)) +
  geom_bar(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Stores with More Part-Timers vs. Full-Timers",
    x = "Worker Type Majority",
    y = "Count"
  )
# Violin + box plot + raw points for `sales`, all drawn at a single x position.
ggplot(clothing, aes(x = "", y = sales)) +
  geom_violin(fill = "lightblue", alpha = 0.6) +
  # Outlier markers are suppressed here because the jitter layer below already
  # draws every observation; otherwise outliers would appear twice.
  geom_boxplot(
    width = 0.2, fill = "white", color = "blue", outlier.shape = NA
  ) +
  # Jitter horizontally only (height = 0 keeps each point at its true `sales`
  # value) and fix the seed so the figure is identical on every run.
  geom_jitter(
    color = "darkblue", alpha = 0.4,
    position = position_jitter(width = 0.1, height = 0, seed = 42)
  ) +
  theme_minimal() +
  labs(title = "Raincloud Plot: Sales Distribution", y = "Sales")
# Bin `margin` into three right-closed intervals: (0, 20] = Low,
# (20, 40] = Medium, (40, 100] = High.
clothing[["margin_category"]] <- cut(
  clothing[["margin"]],
  breaks = c(0, 20, 40, 100),
  labels = c("Low", "Medium", "High")
)
# Number of stores falling into each margin category.
ggplot(clothing) +
  geom_bar(aes(x = margin_category, fill = margin_category), alpha = 0.7) +
  ggtitle("Stores Classified by Profit Margin") +
  xlab("Profit Margin Category") +
  ylab("Count") +
  theme_minimal()
library(waffle)
## Warning: package 'waffle' was built under R version 4.4.2
library(RColorBrewer)
# Percentage of stores at each distinct `nfull` value, rounded to whole
# percent so each unit is one square of the waffle chart.
df_nfull <- round(prop.table(table(clothing$nfull)) * 100)
# One colour per category. The "Set3" palette supports between 3 and 12
# colours, so brewer.pal() is used directly inside that range and the full
# 12-colour palette is interpolated otherwise. The earlier cap of 13 exceeded
# the palette limit and left the colour vector shorter than the category count.
n_parts <- length(df_nfull)
colors <- if (n_parts >= 3L && n_parts <= 12L) {
  brewer.pal(n_parts, "Set3")
} else {
  colorRampPalette(brewer.pal(12L, "Set3"))(n_parts)
}
waffle(df_nfull, rows = 10, colors = colors) +
  labs(title = "Proportion of Full-time Employees Across Stores")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
# Compute the pairwise correlations among the selected numeric variables
# only, using rows that are complete across all of them.
cor_matrix <- cor(
  clothing[
    ,
    c("tsales", "sales", "margin", "nown", "nfull", "npart", "naux", "hoursw")
  ],
  use = "complete.obs"
)
# Print the correlation matrix.
print(cor_matrix)
## tsales sales margin nown nfull npart
## tsales 1.0000000 0.46988817 0.2409816 0.128367110 0.56500853 0.39084445
## sales 0.4698882 1.00000000 0.1373499 0.147894035 0.23718542 0.05008504
## margin 0.2409816 0.13734990 1.0000000 0.052375501 0.10944391 0.18373028
## nown 0.1283671 0.14789404 0.0523755 1.000000000 0.06210814 0.05710353
## nfull 0.5650085 0.23718542 0.1094439 0.062108141 1.00000000 0.28879875
## npart 0.3908444 0.05008504 0.1837303 0.057103526 0.28879875 1.00000000
## naux 0.1810007 -0.01429326 -0.1013364 0.006628208 0.08422274 0.03732837
## hoursw 0.7091963 0.26299664 0.2960733 0.402467525 0.53131829 0.24908371
## naux hoursw
## tsales 0.181000688 0.7091963
## sales -0.014293261 0.2629966
## margin -0.101336417 0.2960733
## nown 0.006628208 0.4024675
## nfull 0.084222744 0.5313183
## npart 0.037328366 0.2490837
## naux 1.000000000 0.2096814
## hoursw 0.209681386 1.0000000
# Correlation plot: coloured cells, upper triangle of the matrix only.
corrplot(cor_matrix, method = "color", type = "upper")
# Simple linear regression of `sales` on store size (`ssize`), followed by the
# coefficient table and overall fit statistics.
model1 <- lm(formula = sales ~ ssize, data = clothing)
summary(model1)
##
## Call:
## lm(formula = sales ~ ssize, data = clothing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6071.3 -2194.5 -813.2 1139.8 20166.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7809.808 299.817 26.049 < 2e-16 ***
## ssize -9.765 1.593 -6.132 2.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3579 on 398 degrees of freedom
## Multiple R-squared: 0.08631, Adjusted R-squared: 0.08402
## F-statistic: 37.6 on 1 and 398 DF, p-value: 2.097e-09
lm(sales ~ ssize, data = clothing) – Борлуулалт (sales) ба дэлгүүрийн талбайн хэмжээ (ssize) хоорондын хамаарлыг шугаман регрессийн загвараар үнэлнэ. summary(model1) – R-squared, p-value, Coefficients зэрэг статистик үзүүлэлтүүдийг харуулна.
# Show the call and coefficients of the already-fitted `model1` instead of
# fitting the identical regression a second time.
model1
##
## Call:
## lm(formula = sales ~ ssize, data = clothing)
##
## Coefficients:
## (Intercept) ssize
## 7809.808 -9.765
# Residual summary, coefficient table, R-squared and F-test for `model1`.
summary(model1)
##
## Call:
## lm(formula = sales ~ ssize, data = clothing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6071.3 -2194.5 -813.2 1139.8 20166.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7809.808 299.817 26.049 < 2e-16 ***
## ssize -9.765 1.593 -6.132 2.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3579 on 398 degrees of freedom
## Multiple R-squared: 0.08631, Adjusted R-squared: 0.08402
## F-statistic: 37.6 on 1 and 398 DF, p-value: 2.097e-09
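Борлуулалт (sales) ба дэлгүүрийн талбайн хэмжээ (ssize) хоорондын хамаарлыг үнэлнэ. R-squared, p-value, Coefficients зэрэг статистик үзүүлэлтүүдийг харуулна. Үр дүнгээс харахад ssize-ийн коэффициент −9.765 (p < 0.001) буюу талбай 1 м²-ээр нэмэгдэхэд нэг м²-д ногдох борлуулалт дунджаар 9.77 орчим нэгжээр буурч байна; гэвч R-squared = 0.086 тул талбайн хэмжээ нь борлуулалтын хэлбэлзлийн ердөө 8.6 орчим хувийг тайлбарлана.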
# Scatter plot of annual sales (`tsales`) against `start`, overlaid with a
# linear (method = "lm") trend line.
ggplot(clothing, aes(x = start, y = tsales)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "red", method = "lm") +
  ggtitle("Annual Sales vs. Store Start Year") +
  xlab("Start Year") +
  ylab("Annual Sales") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Standardise the four clustering variables so that none of them dominates
# the distance computation purely because of its scale.
clothing_scaled <- scale(clothing[, c("tsales", "sales", "margin", "ssize")])
# k-means starts from random centres: fix the seed so the cluster labels are
# reproducible, and try several random starts (nstart) so the result does not
# hinge on one unlucky initialisation.
set.seed(42)
kmeans_result <- kmeans(clothing_scaled, centers = 3, nstart = 25)
# Store the assignment as a factor so ggplot uses a discrete colour scale.
clothing$cluster <- as.factor(kmeans_result$cluster)
ggplot(clothing, aes(x = ssize, y = tsales, color = cluster)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Store Clusters: Size vs. Sales",
    x = "Store Size",
    y = "Annual Sales"
  )
# 2-D binned counts of store size against `sales`, filled on a viridis scale.
ggplot(clothing) +
  geom_bin2d(aes(x = ssize, y = sales)) +
  scale_fill_viridis_c() +
  ggtitle("Sales Density by Store Size") +
  xlab("Store Size (m²)") +
  ylab("Sales per m²") +
  theme_minimal()
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Random forest predicting annual sales from seven store attributes.
# Tree construction is randomised, so the seed is fixed to make the fitted
# model and its importance scores reproducible between runs.
set.seed(42)
model_rf <- randomForest(
  tsales ~ sales + margin + ssize + nfull + npart + inv1 + inv2,
  data = clothing,
  importance = TRUE # also compute the %IncMSE importance measure
)
# Importance table: %IncMSE and IncNodePurity for each predictor.
importance(model_rf)
## %IncMSE IncNodePurity
## sales 33.970165 3.185523e+13
## margin 4.335011 7.600449e+12
## ssize 36.057816 3.357273e+13
## nfull 26.341345 3.543838e+13
## npart 8.967630 1.056988e+13
## inv1 7.700207 4.497323e+12
## inv2 2.633371 6.218003e+12
# Plot the variable-importance measures of the fitted random forest.
varImpPlot(model_rf)
library(moments)
# Shape of the `tsales` and `margin` distributions, ignoring missing values.
# NOTE(review): moments::kurtosis() appears to report Pearson's kurtosis (not
# excess kurtosis), i.e. a normal distribution would score 3 — confirm against
# the package documentation before interpreting the values below.
skewness(clothing$tsales, na.rm = TRUE)
## [1] 2.380106
kurtosis(clothing$tsales, na.rm = TRUE)
## [1] 13.16138
skewness(clothing$margin, na.rm = TRUE)
## [1] -0.3475286
kurtosis(clothing$margin, na.rm = TRUE)
## [1] 6.726451
## [1] 6.726451