knitr - tabuľka
library(dplyr)
library(knitr)
# Descriptive statistics of waiting_time for each traffic_status level:
# group size, mean, standard deviation, minimum, quartiles and maximum.
# Every statistic ignores missing values (na.rm = TRUE), and the result is
# returned ungrouped (.groups = "drop").
traffic_waiting_stats <- log_data %>%
  group_by(traffic_status) %>%
  summarise(
    n = n(),
    across(
      waiting_time,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        q25 = function(x) quantile(x, 0.25, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        q75 = function(x) quantile(x, 0.75, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      # Name each output column after the statistic only (no column prefix).
      .names = "{.fn}"
    ),
    .groups = "drop"
  )

# Render the summary as a captioned table, rounded to two decimals.
traffic_waiting_stats %>%
  kable(
    digits = 2,
    caption = "Descriptive statistics of Waiting Time by Traffic Status"
  )
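Descriptive statistics of Waiting Time by Traffic Status

| traffic_status | n | mean | sd | min | q25 | median | q75 | max |
|:---------------|----:|------:|------:|----:|----:|-------:|----:|----:|
| Clear | 328 | 35.54 | 14.94 | 10 | 22 | 35 | 49 | 60 |
| Detour | 345 | 35.31 | 13.84 | 10 | 24 | 35 | 47 | 60 |
| Heavy | 327 | 34.32 | 14.68 | 10 | 22 | 34 | 49 | 60 |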
alebo krajšie tabuľky s pomocou balíka kableExtra:
library(dplyr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Overall (ungrouped) descriptive statistics of waiting_time: row count,
# mean, standard deviation, minimum, quartiles and maximum, each computed
# with missing values removed.
waiting_stats <- summarise(
  log_data,
  n = n(),
  across(
    waiting_time,
    list(
      mean = function(x) mean(x, na.rm = TRUE),
      sd = function(x) sd(x, na.rm = TRUE),
      min = function(x) min(x, na.rm = TRUE),
      q25 = function(x) quantile(x, 0.25, na.rm = TRUE),
      median = function(x) median(x, na.rm = TRUE),
      q75 = function(x) quantile(x, 0.75, na.rm = TRUE),
      max = function(x) max(x, na.rm = TRUE)
    ),
    # Name each output column after the statistic only (no column prefix).
    .names = "{.fn}"
  )
)

# Render as a styled kableExtra table with one spanning header over all
# eight statistic columns.
waiting_stats %>%
  kable(
    digits = 2,
    caption = "Basic statistics of Waiting Time"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "condensed", "hover")
  ) %>%
  add_header_above(c("Waiting Time statistics" = 8))
Basic statistics of Waiting Time

Waiting Time statistics

| n | mean | sd | min | q25 | median | q75 | max |
|-----:|------:|------:|----:|----:|-------:|----:|----:|
| 1000 | 35.06 | 14.48 | 10 | 23 | 35 | 49 | 60 |
Main Analysis: Logistics
library(dplyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Load the raw logistics dataset: comma-separated, "." as the decimal mark,
# with a header row. Text columns are kept as character vectors.
log_data <- read.csv(
  "smart_logistics_dataset.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  stringsAsFactors = FALSE
)

# Clean the column names. janitor::clean_names() already produces lower-case
# snake_case names, so no separate tolower()/gsub(" ", "_") pass is needed.
log_data <- log_data %>% clean_names()

# Inspect the first rows and the column types of the loaded data.
head(log_data)
str(log_data)
## 'data.frame': 1000 obs. of 16 variables:
## $ timestamp : chr "2024-03-20 00:11:14" "2024-10-30 07:53:51" "2024-07-29 18:42:48" "2024-10-28 00:50:54" ...
## $ asset_id : chr "Truck_7" "Truck_6" "Truck_10" "Truck_9" ...
## $ latitude : num -65.7 22.3 54.9 42.4 -65.8 ...
## $ longitude : num 11.25 -131.71 79.55 -1.48 47.95 ...
## $ inventory_level : int 390 491 190 330 480 118 480 222 245 389 ...
## $ shipment_status : chr "Delayed" "In Transit" "In Transit" "Delivered" ...
## $ temperature : num 27 22.5 25.2 25.4 20.5 24.3 20.7 23.3 26.4 21.9 ...
## $ humidity : num 67.8 54.3 62.2 52.3 57.2 61.8 75.4 64.2 77.2 57.3 ...
## $ traffic_status : chr "Detour" "Heavy" "Detour" "Heavy" ...
## $ waiting_time : int 38 16 34 37 56 56 32 30 14 52 ...
## $ user_transaction_amount: int 320 439 355 227 197 258 263 459 183 127 ...
## $ user_purchase_frequency: int 4 7 3 5 6 10 3 9 2 7 ...
## $ logistics_delay_reason : chr "None" "Weather" "None" "Traffic" ...
## $ asset_utilization : num 60.1 80.9 99.2 97.4 71.6 66.8 73.3 73.8 69.6 63.1 ...
## $ demand_forecast : int 285 174 260 160 270 189 198 253 206 224 ...
## $ logistics_delay : int 1 1 0 1 1 0 1 0 0 0 ...
Testovanie hypotéz
# Hypothesis testing
#### t-test: comparing waiting_time between Clear and Heavy traffic
# Two-sample t-test on waiting_time: rows with traffic_status "Clear" versus
# rows with traffic_status "Heavy". No var.equal argument is passed, so
# t.test() runs with its defaults (reported as a Welch test in the output).
# The argument expressions are kept verbatim because t.test() echoes them in
# the "data:" line of the printed result.
t_test_result <- t.test(
log_data$waiting_time[log_data$traffic_status == "Clear"],
log_data$waiting_time[log_data$traffic_status == "Heavy"]
)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: log_data$waiting_time[log_data$traffic_status == "Clear"] and log_data$waiting_time[log_data$traffic_status == "Heavy"]
## t = 1.0554, df = 652.86, p-value = 0.2916
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.051269 3.494433
## sample estimates:
## mean of x mean of y
## 35.54268 34.32110
ANOVA: Comparing Waiting Time Across Traffic Statuses
# One-way ANOVA: does mean waiting_time differ between traffic_status groups?
anova_result <- aov(
  waiting_time ~ traffic_status,
  data = log_data
)

# Print the ANOVA table (Df, sums of squares, F value and p-value).
print(summary(anova_result))
## Df Sum Sq Mean Sq F value Pr(>F)
## traffic_status 2 276 138.0 0.658 0.518
## Residuals 997 209120 209.8
Logistic Regression: Predicting Logistics Delay
library(dplyr)
# Print the column names of log_data.
names(log_data)
## [1] "timestamp" "asset_id"
## [3] "latitude" "longitude"
## [5] "inventory_level" "shipment_status"
## [7] "temperature" "humidity"
## [9] "traffic_status" "waiting_time"
## [11] "user_transaction_amount" "user_purchase_frequency"
## [13] "logistics_delay_reason" "asset_utilization"
## [15] "demand_forecast" "logistics_delay"
# Print the structure of log_data (column types and leading values).
str(log_data)
## 'data.frame': 1000 obs. of 16 variables:
## $ timestamp : chr "2024-03-20 00:11:14" "2024-10-30 07:53:51" "2024-07-29 18:42:48" "2024-10-28 00:50:54" ...
## $ asset_id : chr "Truck_7" "Truck_6" "Truck_10" "Truck_9" ...
## $ latitude : num -65.7 22.3 54.9 42.4 -65.8 ...
## $ longitude : num 11.25 -131.71 79.55 -1.48 47.95 ...
## $ inventory_level : int 390 491 190 330 480 118 480 222 245 389 ...
## $ shipment_status : chr "Delayed" "In Transit" "In Transit" "Delivered" ...
## $ temperature : num 27 22.5 25.2 25.4 20.5 24.3 20.7 23.3 26.4 21.9 ...
## $ humidity : num 67.8 54.3 62.2 52.3 57.2 61.8 75.4 64.2 77.2 57.3 ...
## $ traffic_status : chr "Detour" "Heavy" "Detour" "Heavy" ...
## $ waiting_time : int 38 16 34 37 56 56 32 30 14 52 ...
## $ user_transaction_amount: int 320 439 355 227 197 258 263 459 183 127 ...
## $ user_purchase_frequency: int 4 7 3 5 6 10 3 9 2 7 ...
## $ logistics_delay_reason : chr "None" "Weather" "None" "Traffic" ...
## $ asset_utilization : num 60.1 80.9 99.2 97.4 71.6 66.8 73.3 73.8 69.6 63.1 ...
## $ demand_forecast : int 285 174 260 160 270 189 198 253 206 224 ...
## $ logistics_delay : int 1 1 0 1 1 0 1 0 0 0 ...
# Convert the response to a factor so that glm() with family = binomial
# treats it as a two-class outcome.
log_data$logistics_delay <- as.factor(log_data$logistics_delay)
# Logistic regression of logistics_delay on waiting time, traffic status,
# asset utilization, demand forecast, temperature, humidity and transaction
# amount. traffic_status is a character column, so glm() dummy-codes it
# (the summary reports Detour and Heavy coefficients).
# NOTE(review): the recorded fit shows traffic_statusHeavy with an estimate
# of about 20 and a standard error of about 592, after 18 Fisher scoring
# iterations. That pattern usually indicates (quasi-)complete separation;
# confirm with table(log_data$traffic_status, log_data$logistics_delay)
# before interpreting any coefficient or p-value.
model_logit <- glm(
logistics_delay ~ waiting_time +
traffic_status +
asset_utilization +
demand_forecast +
temperature +
humidity +
user_transaction_amount,
data = log_data,
family = binomial
)
# Print the coefficient table, deviances and AIC of the fitted model.
summary(model_logit)
##
## Call:
## glm(formula = logistics_delay ~ waiting_time + traffic_status +
## asset_utilization + demand_forecast + temperature + humidity +
## user_transaction_amount, family = binomial, data = log_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.470e+00 1.108e+00 1.327 0.185
## waiting_time -8.594e-03 5.661e-03 -1.518 0.129
## traffic_statusDetour 4.734e-02 1.622e-01 0.292 0.770
## traffic_statusHeavy 2.021e+01 5.917e+02 0.034 0.973
## asset_utilization -5.931e-03 7.003e-03 -0.847 0.397
## demand_forecast -1.515e-05 1.365e-03 -0.011 0.991
## temperature -3.445e-02 2.455e-02 -1.404 0.160
## humidity -1.027e-02 9.426e-03 -1.090 0.276
## user_transaction_amount 5.642e-04 6.836e-04 0.825 0.409
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1368.82 on 999 degrees of freedom
## Residual deviance: 868.49 on 991 degrees of freedom
## AIC: 886.49
##
## Number of Fisher Scoring iterations: 18
Main Program
# Read the raw dataset, keeping text columns as character vectors.
log_data <- read.csv(
  "smart_logistics_dataset.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Normalise the column names: lower-case first, then spaces to underscores.
names(log_data) <- gsub(" ", "_", tolower(names(log_data)))

# Convert the categorical columns, including the logistics_delay indicator,
# to factors in a single step.
log_data <- transform(
  log_data,
  traffic_status = as.factor(traffic_status),
  shipment_status = as.factor(shipment_status),
  logistics_delay_reason = as.factor(logistics_delay_reason),
  asset_id = as.factor(asset_id),
  logistics_delay = as.factor(logistics_delay)
)