R markdown Enkhjin

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the clothing-store data set and inspect its structure.
# The file is addressed through an explicit path instead of setwd(), so the
# session's working directory is left untouched. The directory is still
# machine-specific: adjust `data_dir` when running elsewhere.
data_dir <- "C:\\Users\\Dell\\Downloads"
clothing <- read.csv(file.path(data_dir, "Clothing.csv"))
# Show every column with its type and first few values.
str(clothing)

## 'data.frame':    400 obs. of  14 variables:
##  $ rownames: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ tsales  : int  750000 1926395 1250000 694227 750000 400000 1300000 495340 1200000 495340 ...
##  $ sales   : num  4412 4281 4167 2670 15000 ...
##  $ margin  : num  41 39 40 40 44 41 39 28 41 37 ...
##  $ nown    : num  1 2 1 1 2 ...
##  $ nfull   : num  1 2 2 1 1.96 ...
##  $ npart   : num  1 3 2.22 1.28 1.28 ...
##  $ naux    : num  1.54 1.54 1.41 1.37 1.37 ...
##  $ hoursw  : int  76 192 114 100 104 72 161 80 158 87 ...
##  $ hourspw : num  16.8 22.5 17.2 21.5 15.7 ...
##  $ inv1    : num  17167 17167 292857 22207 22207 ...
##  $ inv2    : num  27177 27177 71571 15000 10000 ...
##  $ ssize   : int  170 450 300 260 50 90 400 100 450 75 ...
##  $ start   : num  41 39 40 40 44 41 39 28 41 37 ...

# Per-column summary statistics (min, quartiles, mean, max) for every variable.
summary(clothing)

##     rownames         tsales            sales           margin     
##  Min.   :  1.0   Min.   :  50000   Min.   :  300   Min.   :16.00  
##  1st Qu.:100.8   1st Qu.: 495340   1st Qu.: 3904   1st Qu.:37.00  
##  Median :200.5   Median : 694227   Median : 5279   Median :39.00  
##  Mean   :200.5   Mean   : 833584   Mean   : 6335   Mean   :38.77  
##  3rd Qu.:300.2   3rd Qu.: 976817   3rd Qu.: 7740   3rd Qu.:41.00  
##  Max.   :400.0   Max.   :5000000   Max.   :27000   Max.   :66.00  
##       nown            nfull           npart            naux      
##  Min.   : 1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 1.000   1st Qu.:1.923   1st Qu.:1.283   1st Qu.:1.333  
##  Median : 1.000   Median :1.956   Median :1.283   Median :1.367  
##  Mean   : 1.284   Mean   :2.069   Mean   :1.566   Mean   :1.390  
##  3rd Qu.: 1.295   3rd Qu.:2.066   3rd Qu.:2.000   3rd Qu.:1.367  
##  Max.   :10.000   Max.   :8.000   Max.   :9.000   Max.   :4.000  
##      hoursw         hourspw            inv1              inv2       
##  Min.   : 32.0   Min.   : 5.708   Min.   :   1000   Min.   :   350  
##  1st Qu.: 80.0   1st Qu.:13.541   1st Qu.:  20000   1st Qu.: 10000  
##  Median :104.0   Median :17.745   Median :  22207   Median : 22860  
##  Mean   :121.1   Mean   :18.955   Mean   :  58257   Mean   : 27829  
##  3rd Qu.:145.2   3rd Qu.:24.303   3rd Qu.:  62269   3rd Qu.: 22860  
##  Max.   :582.0   Max.   :43.326   Max.   :1500000   Max.   :400000  
##      ssize            start      
##  Min.   :  16.0   Min.   :16.00  
##  1st Qu.:  80.0   1st Qu.:37.00  
##  Median : 120.0   Median :40.00  
##  Mean   : 151.1   Mean   :42.81  
##  3rd Qu.: 190.0   3rd Qu.:42.00  
##  Max.   :1214.0   Max.   :90.00

# Preview the first rows of the data set (head() returns 6 rows by default).
head(clothing)

library(dplyr): dplyr санг ашиглан өгөгдөл боловсруулахад хялбар болгоно. read.csv("Clothing.csv"): Өгөгдлийг CSV файлаас уншина. str(clothing): Өгөгдлийн бүтэц (багана, өгөгдлийн төрөл гэх мэт) харуулна. summary(clothing): Бүх хувьсагчдын дундаж, медиан, хамгийн бага, хамгийн их утга зэрэг статистикийг харуулна. head(clothing): Өгөгдлийн эхний хэдэн мөрийг харуулна.

# Headline descriptive statistics in a one-row data frame: centre and spread
# of `sales`, plus the average `margin` and average `hoursw`.
# Missing values are dropped from every aggregate.
summarise(
  clothing,
  Mean_Sales = mean(sales, na.rm = TRUE),
  Median_Sales = median(sales, na.rm = TRUE),
  SD_Sales = sd(sales, na.rm = TRUE),
  Mean_Margin = mean(margin, na.rm = TRUE),
  Mean_Hours_Worked = mean(hoursw, na.rm = TRUE)
)

mean(sales, na.rm = TRUE) – Борлуулалтын дундаж утгыг тооцоолно. median(sales, na.rm = TRUE) – Борлуулалтын медианыг олно. sd(sales, na.rm = TRUE) – Борлуулалтын стандарт хазайлт (өргөн тархалттай эсэх). mean(margin, na.rm = TRUE) – Нийт ашгийн харьцааны дундаж утга. mean(hoursw, na.rm = TRUE) – Нийт ажилласан цагийн дундаж.

library(ggplot2)
# Bar chart counting stores per distinct `nown` value; the column is turned
# into a factor so each value gets its own discrete bar.
ggplot(data = clothing, mapping = aes(x = factor(nown))) +
  geom_bar(alpha = 0.7, fill = "orange") +
  labs(
    title = "Number of Owners per Store",
    x = "Number of Owners",
    y = "Count"
  ) +
  theme_minimal()

library(reshape2)

## Warning: package 'reshape2' was built under R version 4.4.2

# Keep only the full-time and part-time head-count columns, then reshape them
# to long format (columns `variable` and `value`) for plotting.
clothing_workers <- clothing[c("nfull", "npart")]
clothing_workers_melted <- melt(data = clothing_workers)

## No id variables; using all as measure variables

# One bar per worker type whose height is the mean of `value`.
# The two fill colours are applied to the two summarised bars in order.
ggplot(
  data = clothing_workers_melted,
  mapping = aes(x = variable, y = value)
) +
  geom_bar(
    stat = "summary",
    fun = mean,
    alpha = 0.7,
    fill = c("palevioletred2", "lightblue")
  ) +
  labs(
    title = "Average Number of Full-Time vs. Part-Time Workers",
    x = "Worker Type",
    y = "Average Count"
  ) +
  theme_minimal()

# Жилийн борлуулалтын хистограм

# Histogram of `tsales` split into 30 bins.
ggplot(data = clothing, mapping = aes(x = tsales)) +
  geom_histogram(bins = 30, alpha = 0.7, fill = "orchid4") +
  labs(
    title = "Distribution of Total Sales",
    x = "Total Sales",
    y = "Count"
  )

# Single box plot of `sales` (sales per square meter) to expose outliers.
ggplot(data = clothing, mapping = aes(y = sales)) +
  geom_boxplot(alpha = 0.6, fill = "lightblue") +
  labs(
    title = "Boxplot of Sales per Square Meter",
    y = "Sales per Square Meter"
  )

library(ggplot2)
# Kernel density of `sales`, with a rug along the axis marking each
# individual observation.
ggplot(data = clothing, mapping = aes(x = sales)) +
  geom_density(alpha = 0.5, fill = "pink") +
  geom_rug(alpha = 0.8, color = "purple") +
  labs(title = "Sales Distribution", x = "Sales", y = "Density") +
  theme_minimal()

# Single box plot of the `margin` column.
ggplot(data = clothing, mapping = aes(y = margin)) +
  geom_boxplot(color = "darkblue", fill = "lightblue") +
  labs(title = "Profit Margin Boxplot", y = "Margin") +
  theme_minimal()

# Scatter plot of `sales` against `ssize`, overlaid with a fitted
# straight-line (lm) trend and its confidence band.
ggplot(data = clothing, mapping = aes(x = ssize, y = sales)) +
  geom_point(color = "purple", alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Sales vs. Store Size",
    x = "Store Size (m²)",
    y = "Sales per m²"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Keep only the two investment columns, then reshape them to long format
# (columns `variable` and `value`) for plotting.
clothing_investments <- clothing[c("inv1", "inv2")]
clothing_investments_melted <- melt(data = clothing_investments)

## No id variables; using all as measure variables

# One bar per investment column whose height is the mean of `value`.
# The two fill colours are applied to the two summarised bars in order.
ggplot(
  data = clothing_investments_melted,
  mapping = aes(x = variable, y = value)
) +
  geom_bar(
    stat = "summary",
    fun = mean,
    alpha = 0.7,
    fill = c("green", "blue")
  ) +
  labs(
    title = "Average Investments: Shop vs. Automation",
    x = "Investment Type",
    y = "Average Amount"
  ) +
  theme_minimal()

# Histogram of the `start` column using bins of width 5; white outlines
# separate neighbouring bars.
ggplot(data = clothing, mapping = aes(x = start)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.7,
    color = "white",
    fill = "darkgreen"
  ) +
  labs(
    title = "Business Start Year Distribution",
    x = "Year",
    y = "Number of Stores"
  ) +
  theme_minimal()

# Label each store by which worker type is in the majority.
# Stores where `npart` equals `nfull` get their own "Equal" label instead of
# being counted as "More Full-Timers"; a missing count on either side yields
# NA, matching what ifelse() produced before.
clothing$more_part_timers <- dplyr::case_when(
  is.na(clothing$npart) | is.na(clothing$nfull) ~ NA_character_,
  clothing$npart > clothing$nfull ~ "More Part-Timers",
  clothing$npart < clothing$nfull ~ "More Full-Timers",
  TRUE ~ "Equal"
)

# Count of stores per worker-majority label; bars are coloured by the same
# label, which also produces the legend.
ggplot(
  data = clothing,
  mapping = aes(x = more_part_timers, fill = more_part_timers)
) +
  geom_bar(alpha = 0.7) +
  labs(
    title = "Stores with More Part-Timers vs. Full-Timers",
    x = "Worker Type Majority",
    y = "Count"
  ) +
  theme_minimal()

# Distribution of `sales` shown three ways on one axis: a violin for the
# density shape, a narrow box plot for the quartiles, and jittered points
# for the raw observations. Layers are drawn in this order, so the points
# sit on top.
ggplot(data = clothing, mapping = aes(x = "", y = sales)) +
  geom_violin(alpha = 0.6, fill = "lightblue") +
  geom_boxplot(color = "blue", fill = "white", width = 0.2) +
  geom_jitter(alpha = 0.4, color = "darkblue", width = 0.1) +
  labs(title = "Raincloud Plot: Sales Distribution", y = "Sales") +
  theme_minimal()

# Bin `margin` into three ordered bands. Intervals are right-closed:
# (0, 20] -> "Low", (20, 40] -> "Medium", (40, 100] -> "High".
clothing[["margin_category"]] <- cut(
  x = clothing[["margin"]],
  breaks = c(0, 20, 40, 100),
  labels = c("Low", "Medium", "High")
)

# Count of stores in each margin band; bars are coloured by band.
ggplot(
  data = clothing,
  mapping = aes(x = margin_category, fill = margin_category)
) +
  geom_bar(alpha = 0.7) +
  labs(
    title = "Stores Classified by Profit Margin",
    x = "Profit Margin Category",
    y = "Count"
  ) +
  theme_minimal()

library(waffle)

## Warning: package 'waffle' was built under R version 4.4.2

library(RColorBrewer)  

# Share of stores, in whole percent, for each distinct `nfull` value.
# NOTE(review): because each share is rounded separately, the squares may
# not add up to exactly 100.
df_nfull <- round(table(clothing$nfull) / sum(table(clothing$nfull)) * 100)

# Build exactly one colour per group. "Set3" only provides 3 to 12 colours,
# so outside that range the 12-colour palette is interpolated to the
# required length instead of requesting an unsupported size.
n_groups <- length(df_nfull)
colors <- if (n_groups >= 3L && n_groups <= 12L) {
  brewer.pal(n_groups, "Set3")
} else {
  colorRampPalette(brewer.pal(12L, "Set3"))(n_groups)
}

waffle(df_nfull, rows = 10, colors = colors) +
  labs(title = "Proportion of Full-time Employees Across Stores")

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.2

## corrplot 0.95 loaded

# Select only the numeric variables of interest and compute their pairwise
# correlations, using rows that are complete across all selected columns.
cor_matrix <- cor(
  x = clothing[, c(
    "tsales", "sales", "margin", "nown",
    "nfull", "npart", "naux", "hoursw"
  )],
  use = "complete.obs"
)

# Print the correlation matrix.
print(cor_matrix)

##           tsales       sales     margin        nown      nfull      npart
## tsales 1.0000000  0.46988817  0.2409816 0.128367110 0.56500853 0.39084445
## sales  0.4698882  1.00000000  0.1373499 0.147894035 0.23718542 0.05008504
## margin 0.2409816  0.13734990  1.0000000 0.052375501 0.10944391 0.18373028
## nown   0.1283671  0.14789404  0.0523755 1.000000000 0.06210814 0.05710353
## nfull  0.5650085  0.23718542  0.1094439 0.062108141 1.00000000 0.28879875
## npart  0.3908444  0.05008504  0.1837303 0.057103526 0.28879875 1.00000000
## naux   0.1810007 -0.01429326 -0.1013364 0.006628208 0.08422274 0.03732837
## hoursw 0.7091963  0.26299664  0.2960733 0.402467525 0.53131829 0.24908371
##                naux    hoursw
## tsales  0.181000688 0.7091963
## sales  -0.014293261 0.2629966
## margin -0.101336417 0.2960733
## nown    0.006628208 0.4024675
## nfull   0.084222744 0.5313183
## npart   0.037328366 0.2490837
## naux    1.000000000 0.2096814
## hoursw  0.209681386 1.0000000

# Correlation plot: colour-coded cells, upper triangle only.
corrplot(corr = cor_matrix, type = "upper", method = "color")

# Simple linear regression of `sales` on `ssize`.
model1 <- lm(formula = sales ~ ssize, data = clothing)
# Residual quantiles, coefficient table, R-squared and overall F-test.
summary(model1)

## 
## Call:
## lm(formula = sales ~ ssize, data = clothing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6071.3 -2194.5  -813.2  1139.8 20166.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7809.808    299.817  26.049  < 2e-16 ***
## ssize         -9.765      1.593  -6.132  2.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3579 on 398 degrees of freedom
## Multiple R-squared:  0.08631,    Adjusted R-squared:  0.08402 
## F-statistic:  37.6 on 1 and 398 DF,  p-value: 2.097e-09

lm(sales ~ ssize, data = clothing) – Борлуулалт (sales) ба дэлгүүрийн талбайн хэмжээ (ssize) хоорондын хамаарлыг үнэлнэ. summary(model1) – R-squared, p-value, Coefficients зэрэг статистикуудыг өгнө.

# Fit the same regression again and print only its call and coefficients.
lm(formula = sales ~ ssize, data = clothing)

## 
## Call:
## lm(formula = sales ~ ssize, data = clothing)
## 
## Coefficients:
## (Intercept)        ssize  
##    7809.808       -9.765

# Full summary of the fitted `model1` (same model object as summarised above).
summary(model1)

## 
## Call:
## lm(formula = sales ~ ssize, data = clothing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6071.3 -2194.5  -813.2  1139.8 20166.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7809.808    299.817  26.049  < 2e-16 ***
## ssize         -9.765      1.593  -6.132  2.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3579 on 398 degrees of freedom
## Multiple R-squared:  0.08631,    Adjusted R-squared:  0.08402 
## F-statistic:  37.6 on 1 and 398 DF,  p-value: 2.097e-09

Борлуулалт (sales) ба дэлгүүрийн талбайн хэмжээ (ssize) хоорондын хамаарлыг үнэлнэ. R-squared, p-value, Coefficients зэрэг статистикуудыг өгнө.

# Scatter plot of `tsales` against `start` with a fitted straight-line (lm)
# trend and its confidence band.
ggplot(data = clothing, mapping = aes(x = start, y = tsales)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Annual Sales vs. Store Start Year",
    x = "Start Year",
    y = "Annual Sales"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Standardise the clustering variables (mean 0, sd 1) so that each one
# contributes on the same scale to the distance computation.
clothing_scaled <- scale(clothing[, c("tsales", "sales", "margin", "ssize")])

# k-means starts from randomly chosen centres. Fix the seed so the cluster
# assignments are reproducible between runs, and try several random starts
# so the result is less likely to be a poor local optimum.
set.seed(123)
kmeans_result <- kmeans(clothing_scaled, centers = 3, nstart = 25)

# Store the cluster label as a factor so plots treat it as a discrete group.
clothing$cluster <- as.factor(kmeans_result$cluster)

# Scatter plot of `tsales` against `ssize`, with points coloured by their
# k-means cluster label.
ggplot(
  data = clothing,
  mapping = aes(x = ssize, y = tsales, color = cluster)
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Store Clusters: Size vs. Sales",
    x = "Store Size",
    y = "Annual Sales"
  ) +
  theme_minimal()

# Heat map of observation counts over a rectangular grid of
# (`ssize`, `sales`) bins, filled with the continuous viridis colour scale.
ggplot(data = clothing, mapping = aes(x = ssize, y = sales)) +
  geom_bin2d() +
  scale_fill_viridis_c() +
  labs(
    title = "Sales Density by Store Size",
    x = "Store Size (m²)",
    y = "Sales per m²"
  ) +
  theme_minimal()

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.2

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

# Random-forest regression of `tsales` on sales, margin, store size, staffing
# and the two investment columns, with variable-importance measures enabled.
# The forest is grown from random bootstrap samples and random feature
# subsets, so the seed is fixed to make the importance table reproducible.
set.seed(123)
model_rf <- randomForest(
  tsales ~ sales + margin + ssize + nfull + npart + inv1 + inv2,
  data = clothing,
  importance = TRUE
)

# Importance table with the columns %IncMSE and IncNodePurity.
importance(model_rf)

##          %IncMSE IncNodePurity
## sales  33.970165  3.185523e+13
## margin  4.335011  7.600449e+12
## ssize  36.057816  3.357273e+13
## nfull  26.341345  3.543838e+13
## npart   8.967630  1.056988e+13
## inv1    7.700207  4.497323e+12
## inv2    2.633371  6.218003e+12

# Plot the variable-importance measures of the fitted random forest.
varImpPlot(model_rf)

library(moments)

# Skewness of `tsales`, ignoring missing values (positive = longer right tail).
skewness(clothing$tsales, na.rm = TRUE)

## [1] 2.380106

# Kurtosis of `tsales`, ignoring missing values.
# NOTE(review): moments::kurtosis appears to report Pearson's kurtosis
# (normal distribution = 3), not excess kurtosis - confirm before interpreting.
kurtosis(clothing$tsales, na.rm = TRUE)

## [1] 13.16138

# Skewness of `margin`, ignoring missing values (negative = longer left tail).
skewness(clothing$margin, na.rm = TRUE)

## [1] -0.3475286

# Kurtosis of `margin`, ignoring missing values.
kurtosis(clothing$margin, na.rm = TRUE)

## [1] 6.726451

R markdown Enkhjin

TS. Enkhjin

2025-02-16