Корреляционный и регрессионный анализ

Базовая статистика по переменным

Данные представляют собой квартальные наблюдения с 2018 по 2024 год. Ниже представлено описание переменных:

variables_description <- data.frame(
  Переменная = c("year", "quarter", "gdp", "m2", "flow", "n", "n_re", "n_pure", "n_pure_re", "market", "warehouses"),
  Описание = c("Год наблюдения", "Квартал наблюдения", "Валовой внутренний продукт России", 
               "Денежная масса (агрегат M2)", "Объем средств, привлеченных в ЗПИФ", 
               "Общее количество ЗПИФ", "Количество ЗПИФ недвижимости", 
               "Стоимость чистых активов ЗПИФ", 
               "Стоимость чистых активов ЗПИФ недвижимости", 
               "Объем рынка коммерческой недвижимости", 
               "Объем рынка складской недвижимости"),
  Единицы_измерения = c("Год", "Квартал (1-4)", "млрд руб.", "млрд руб.", "млрд руб.", 
                        "единиц", "единиц", "единиц", "млн руб.", "млн кв.м.", "млн кв.м.")
)

# Вывод таблицы с описанием переменных
kable(variables_description, caption = "Описание переменных")
Описание переменных
Переменная Описание Единицы_измерения
year Год наблюдения Год
quarter Квартал наблюдения Квартал (1-4)
gdp Валовой внутренний продукт России млрд руб.
m2 Денежная масса (агрегат M2) млрд руб.
flow Объем средств, привлеченных в ЗПИФ млрд руб.
n Общее количество ЗПИФ единиц
n_re Количество ЗПИФ недвижимости единиц
n_pure Стоимость чистых активов ЗПИФ единиц
n_pure_re Стоимость чистых активов ЗПИФ недвижимости млн руб.
market Объем рынка коммерческой недвижимости млн кв.м.
warehouses Объем рынка складской недвижимости млн кв.м.
df_clean <- df %>%
  mutate(across(where(is.character), as.numeric))
df_clean$yearqtr <- paste(df_clean$year, df_clean$quarter, sep = "Q")

summary(df_clean)
##       year         quarter          gdp              m2        
##  Min.   :2018   Min.   :1.00   Min.   :22474   Min.   : 42046  
##  1st Qu.:2019   1st Qu.:1.75   1st Qu.:27492   1st Qu.: 48793  
##  Median :2021   Median :2.50   Median :32026   Median : 59506  
##  Mean   :2021   Mean   :2.50   Mean   :34198   Mean   : 67053  
##  3rd Qu.:2023   3rd Qu.:3.25   3rd Qu.:40126   3rd Qu.: 84516  
##  Max.   :2024   Max.   :4.00   Max.   :50814   Max.   :111025  
##                                NA's   :1                       
##       flow               n             n_re           n_pure        
##  Min.   :  -6.00   Min.   : 850   Min.   : 33.0   Min.   : 2329650  
##  1st Qu.:  65.05   1st Qu.:1014   1st Qu.: 50.0   1st Qu.: 3299789  
##  Median : 172.90   Median :1293   Median : 95.0   Median : 5330049  
##  Mean   : 251.79   Mean   :1411   Mean   :107.7   Mean   : 5902585  
##  3rd Qu.: 305.10   3rd Qu.:1638   3rd Qu.:160.0   3rd Qu.: 7112619  
##  Max.   :1530.00   Max.   :2593   Max.   :239.0   Max.   :13927461  
##  NA's   :1         NA's   :1      NA's   :1       NA's   :1         
##    n_pure_re             market        warehouses      yearqtr         
##  Min.   :    52175   Min.   :2.300   Min.   :0.700   Length:28         
##  1st Qu.:206745242   1st Qu.:2.850   1st Qu.:0.920   Class :character  
##  Median :411358775   Median :3.600   Median :1.000   Mode  :character  
##  Mean   :433563593   Mean   :3.589   Mean   :1.269                     
##  3rd Qu.:650819008   3rd Qu.:4.700   3rd Qu.:1.700                     
##  Max.   :977086593   Max.   :4.700   Max.   :2.100                     
##  NA's   :1           NA's   :1       NA's   :1

Исследование корреляций между переменными

# Вычисление корреляционной матрицы
correlation_matrix <- cor(df_clean[, c("gdp", "m2", "flow", "n", "n_re", "n_pure", "n_pure_re", 
                                      "market", "warehouses")], 
                        use = "complete.obs")

# Тепловая карта корреляций 
melted_corr <- melt(correlation_matrix)
colnames(melted_corr) <- c("Var1", "Var2", "value")
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#6D9EC1", high = "#E46726", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = "Корреляция\nПирсона") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  coord_fixed() +
  labs(title = "Тепловая карта корреляций",
       x = "", y = "")

# Полная визуализация с диаграммами рассеяния и гистограммами
ggpairs(df_clean[, c("gdp", "m2", "flow", "n", "n_re", "n_pure", "n_pure_re", 
                    "market", "warehouses")],
       upper = list(continuous = "cor", combo = "box_no_facet"),
       lower = list(continuous = "smooth", combo = "dot_no_facet"),
       diag = list(continuous = "barDiag"),
       title = "Матрица корреляций Пирсона с диаграммами рассеяния",
       axisLabels = "show",
       switch = "both") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        axis.text.y = element_text(size = 8),
        strip.text = element_text(size = 8))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

На основе матрицы можно сделать следующие выводы:

Наблюдается высокая положительная корреляция между flow и gdp (0.69) -Средняя положительная корреляция между flow и m2 (0.54)

-Умеренная положительная корреляция между flow и общим количеством ЗПИФ (n) (0.56)

-Средняя положительная корреляция между flow и стоимостью чистых активов ЗПИФ (n_pure) (0.57)

# 1. Базовая модель с макроэкономическими переменными
model1 <- lm(flow ~ gdp + m2, data = df_clean)
summary(model1)
## 
## Call:
## lm(formula = flow ~ gdp + m2, data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -382.96 -124.10  -12.79  121.81  761.32 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -7.794e+02  2.082e+02  -3.743  0.00106 **
## gdp          4.679e-02  1.440e-02   3.250  0.00353 **
## m2          -8.961e-03  5.939e-03  -1.509  0.14497   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 231.2 on 23 degrees of freedom
##   (2 пропущенных наблюдений удалены)
## Multiple R-squared:  0.5257, Adjusted R-squared:  0.4845 
## F-statistic: 12.75 on 2 and 23 DF,  p-value: 0.0001881
# 2. Модель с добавлением количественных характеристик ЗПИФ
model2 <- lm(flow ~ gdp + m2 + n + n_re, data = df_clean)
summary(model2)
## 
## Call:
## lm(formula = flow ~ gdp + m2 + n + n_re, data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -383.72 -140.22  -44.16  117.76  721.66 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -1.353e+03  6.806e+02  -1.988   0.0606 . 
## gdp          6.043e-02  1.900e-02   3.180   0.0047 **
## m2           8.677e-03  1.885e-02   0.460   0.6502   
## n           -8.679e-01  9.163e-01  -0.947   0.3548   
## n_re         9.742e-01  1.778e+00   0.548   0.5897   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 239 on 20 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.5522, Adjusted R-squared:  0.4626 
## F-statistic: 6.165 on 4 and 20 DF,  p-value: 0.002116
# 3. Модель с добавлением показателей стоимости активов
model3 <- lm(flow ~ gdp + m2 + n + n_re + n_pure + n_pure_re, data = df_clean)
summary(model3)
## 
## Call:
## lm(formula = flow ~ gdp + m2 + n + n_re + n_pure + n_pure_re, 
##     data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -258.04 -154.66  -55.03   76.51  573.73 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -1.650e+02  9.185e+02  -0.180  0.85941   
## gdp          7.340e-02  1.877e-02   3.910  0.00103 **
## m2          -7.177e-03  2.145e-02  -0.335  0.74184   
## n           -1.577e+00  1.546e+00  -1.020  0.32135   
## n_re        -1.278e+00  1.985e+00  -0.644  0.52779   
## n_pure       1.397e-04  1.254e-04   1.114  0.27992   
## n_pure_re   -3.485e-07  2.174e-07  -1.603  0.12639   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 223.7 on 18 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.6468, Adjusted R-squared:  0.529 
## F-statistic: 5.493 on 6 and 18 DF,  p-value: 0.002195
# 4. Полная модель с показателями рынка недвижимости
model4 <- lm(flow ~ gdp + m2 + n + n_re + n_pure + n_pure_re + market + warehouses, data = df_clean)
summary(model4)
## 
## Call:
## lm(formula = flow ~ gdp + m2 + n + n_re + n_pure + n_pure_re + 
##     market + warehouses, data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -304.22  -94.69  -55.78  116.21  504.64 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.428e+03  1.527e+03   0.935  0.36351   
## gdp          6.242e-02  1.935e-02   3.225  0.00529 **
## m2           1.343e-03  2.774e-02   0.048  0.96199   
## n           -2.981e+00  1.809e+00  -1.648  0.11883   
## n_re        -3.787e+00  3.021e+00  -1.253  0.22804   
## n_pure       3.495e-04  1.716e-04   2.036  0.05860 . 
## n_pure_re   -2.856e-07  2.615e-07  -1.092  0.29096   
## market      -3.360e+01  2.190e+02  -0.153  0.87997   
## warehouses  -5.381e+02  5.893e+02  -0.913  0.37474   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 216 on 16 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.7073, Adjusted R-squared:  0.5609 
## F-statistic: 4.832 on 8 and 16 DF,  p-value: 0.00362
# 5. Модель с выбранными переменными на основе корреляционной матрицы
# Выбираем переменные с наибольшей корреляцией с flow
model5 <- lm(flow ~ gdp + n + n_pure, data = df_clean)
summary(model5)
## 
## Call:
## lm(formula = flow ~ gdp + n + n_pure, data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -287.23 -153.80  -54.45  116.45  638.65 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -5.166e+02  4.041e+02  -1.278  0.21509   
## gdp          6.323e-02  1.735e-02   3.643  0.00152 **
## n           -1.646e+00  8.286e-01  -1.987  0.06014 . 
## n_pure       1.451e-04  1.018e-04   1.425  0.16894   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 224.8 on 21 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.5839, Adjusted R-squared:  0.5245 
## F-statistic: 9.823 on 3 and 21 DF,  p-value: 0.0002991
# 6. Простая модель с одной переменной, имеющей наивысшую корреляцию (gdp)
model6 <- lm(flow ~ gdp, data = df_clean)
summary(model6)
## 
## Call:
## lm(formula = flow ~ gdp, data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -364.69 -130.08  -11.94   89.86  848.99 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.779e+02  2.022e+02  -3.352  0.00265 ** 
## gdp          2.674e-02  5.695e-03   4.695 9.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 237.2 on 24 degrees of freedom
##   (2 пропущенных наблюдений удалены)
## Multiple R-squared:  0.4788, Adjusted R-squared:  0.4571 
## F-statistic: 22.05 on 1 and 24 DF,  p-value: 9.011e-05
# 7. Проверка на мультиколлинеарность для полной модели
if(requireNamespace("car", quietly = TRUE)) {
  vif_result <- vif(model4)
  print("Факторы инфляции дисперсии (VIF):")
  print(vif_result)
}
## [1] "Факторы инфляции дисперсии (VIF):"
##        gdp         m2          n       n_re     n_pure  n_pure_re     market 
##  12.706962 158.281222 348.469934  19.148616 148.784236   2.970638  18.487923 
## warehouses 
##  37.708428
# 8. Логарифмические трансформации
min_flow <- min(df_clean$flow, na.rm = TRUE)
if(min_flow <= 0) {
  df_clean$log_flow <- log(df_clean$flow - min_flow + 1)
} else {
  df_clean$log_flow <- log(df_clean$flow)
}

df_clean$log_gdp <- log(df_clean$gdp)
df_clean$log_m2 <- log(df_clean$m2)
df_clean$log_n <- log(df_clean$n)
df_clean$log_n_pure <- log(df_clean$n_pure)

# 9. Логарифмическая модель с ключевыми переменными
log_model <- lm(log_flow ~ log_gdp + log_m2 + log_n + log_n_pure, data = df_clean)
summary(log_model)
## 
## Call:
## lm(formula = log_flow ~ log_gdp + log_m2 + log_n + log_n_pure, 
##     data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0722 -0.5425  0.1074  0.6794  1.7226 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -77.6033    45.7325  -1.697  0.10523   
## log_gdp       2.4245     3.1958   0.759  0.45691   
## log_m2        0.5816     6.6653   0.087  0.93134   
## log_n       -21.9279    10.4177  -2.105  0.04815 * 
## log_n_pure   13.5001     4.0953   3.296  0.00361 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.123 on 20 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.5102, Adjusted R-squared:  0.4123 
## F-statistic: 5.209 on 4 and 20 DF,  p-value: 0.004845
# 10. Диагностика для логарифмической модели
par(mfrow=c(2,2))
plot(log_model)

# 11. Тесты на гетероскедастичность и нормальность остатков
if(requireNamespace("lmtest", quietly = TRUE) && requireNamespace("tseries", quietly = TRUE)) {
  # Тест Бреуша-Пагана(на гетероскедастичность)
  bp_test <- lmtest::bptest(log_model)
  print("Тест Бреуша-Пагана на гетероскедастичность:")
  print(bp_test)
  
  # Тест Шапиро-Уилка(на нормальность остатков)
  sw_test <- shapiro.test(residuals(log_model))
  print("Тест Шапиро-Уилка на нормальность остатков:")
  print(sw_test)
}
## [1] "Тест Бреуша-Пагана на гетероскедастичность:"
## 
##  studentized Breusch-Pagan test
## 
## data:  log_model
## BP = 10.704, df = 4, p-value = 0.0301
## 
## [1] "Тест Шапиро-Уилка на нормальность остатков:"
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(log_model)
## W = 0.9452, p-value = 0.195
# 12. Пошаговое построение модели с помощью функции step
step_model <- step(model4, direction = "both", trace = TRUE)
## Start:  AIC=275.61
## flow ~ gdp + m2 + n + n_re + n_pure + n_pure_re + market + warehouses
## 
##              Df Sum of Sq     RSS    AIC
## - m2          1       109  746798 273.62
## - market      1      1099  747787 273.65
## - warehouses  1     38909  785598 274.88
## - n_pure_re   1     55660  802348 275.41
## <none>                     746688 275.61
## - n_re        1     73325  820013 275.95
## - n           1    126761  873450 277.53
## - n_pure      1    193542  940230 279.38
## - gdp         1    485522 1232210 286.14
## 
## Step:  AIC=273.62
## flow ~ gdp + n + n_re + n_pure + n_pure_re + market + warehouses
## 
##              Df Sum of Sq     RSS    AIC
## - market      1      2595  749393 271.70
## - n_pure_re   1     55594  802392 273.41
## - warehouses  1     61639  808437 273.60
## <none>                     746798 273.62
## - n_re        1     99712  846510 274.75
## + m2          1       109  746688 275.61
## - n_pure      1    221373  968171 278.11
## - n           1    280248 1027046 279.58
## - gdp         1    497868 1244666 284.39
## 
## Step:  AIC=271.7
## flow ~ gdp + n + n_re + n_pure + n_pure_re + warehouses
## 
##              Df Sum of Sq     RSS    AIC
## <none>                     749393 271.70
## - n_re        1    114857  864250 273.27
## + market      1      2595  746798 273.62
## + m2          1      1606  747787 273.65
## - warehouses  1    157179  906572 274.46
## - n_pure_re   1    168130  917523 274.76
## - n_pure      1    218813  968206 276.11
## - n           1    287157 1036550 277.81
## - gdp         1    510392 1259785 282.69
summary(step_model)
## 
## Call:
## lm(formula = flow ~ gdp + n + n_re + n_pure + n_pure_re + warehouses, 
##     data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -301.34  -98.73  -28.75  114.58  511.80 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.292e+03  1.126e+03   1.147  0.26634   
## gdp          6.272e-02  1.791e-02   3.501  0.00255 **
## n           -2.838e+00  1.081e+00  -2.626  0.01713 * 
## n_re        -3.534e+00  2.127e+00  -1.661  0.11404   
## n_pure       3.428e-04  1.495e-04   2.293  0.03414 * 
## n_pure_re   -3.313e-07  1.649e-07  -2.010  0.05971 . 
## warehouses  -5.943e+02  3.058e+02  -1.943  0.06782 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 204 on 18 degrees of freedom
##   (3 пропущенных наблюдений удалены)
## Multiple R-squared:  0.7062, Adjusted R-squared:  0.6083 
## F-statistic: 7.211 on 6 and 18 DF,  p-value: 0.000486
# 13. Модель с учетом сезонности (квартальные данные)
model_seasonal <- lm(flow ~ gdp + m2 + factor(quarter), data = df_clean)
summary(model_seasonal)
## 
## Call:
## lm(formula = flow ~ gdp + m2 + factor(quarter), data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -330.96 -149.47  -16.08  127.54  694.69 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      -7.166e+02  2.330e+02  -3.076  0.00596 **
## gdp               3.922e-02  2.209e-02   1.775  0.09107 . 
## m2               -5.818e-03  8.818e-03  -0.660  0.51690   
## factor(quarter)2 -2.344e+01  1.346e+02  -0.174  0.86353   
## factor(quarter)3 -9.244e+01  1.513e+02  -0.611  0.54824   
## factor(quarter)4  9.732e+01  1.962e+02   0.496  0.62530   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 237.3 on 20 degrees of freedom
##   (2 пропущенных наблюдений удалены)
## Multiple R-squared:  0.5655, Adjusted R-squared:  0.4569 
## F-statistic: 5.206 on 5 and 20 DF,  p-value: 0.003207