Aplicação de modelos de regressão múltipla para identificação de fatores críticos no desempenho de atletas de golfe

Introdução

Este relatório apresenta a análise de fatores que influenciam o desempenho dos golfistas, com base no dataset fornecido. O objetivo é identificar as variáveis mais importantes para a colocação final dos atletas (FINISH).

Importação e preparação dos dados

Este conjunto de dados foi retirado de https://www.kaggle.com/datasets/felipepedraza02/open-championship-data. Antes da importação dos dados foi feita uma seleção manual das variáveis de interesse.

#Importação dos dados
data <- read_excel("C:/Users/felip/Desktop/MBA Data Science/archive/dadoslimpos.xlsx")

# Identificar colunas que contêm o símbolo `%`
colunas_com_percentual <- sapply(data, function(coluna) any(grepl("%", coluna)))

# Converter apenas as colunas com `%` e remover o símbolo
data[colunas_com_percentual] <- lapply(data[colunas_com_percentual], function(x) {
  as.numeric(gsub("%", "", x)) / 100
})

# Identificar colunas numéricas armazenadas como texto (excluindo coluna PLAYER)
colunas_a_converter <- sapply(data, function(coluna) {
  if (is.character(coluna)) {
    all(!is.na(as.numeric(gsub(",", ".", coluna))), na.rm = TRUE)
  } else {
    FALSE
  }
})

# Converter vírgulas para pontos apenas nas colunas identificadas
data[colunas_a_converter] <- lapply(data[colunas_a_converter], function(coluna) {
  as.numeric(gsub(",", ".", coluna))
})

# Visualizar o resultado
print(data)

## # A tibble: 644 × 22
##     Year PLAYER    FINISH  OWGR DDist  DAcc   GIR GIRNF   P2H  RP2H APP100 SCRMB
##    <dbl> <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
##  1  2008 PADRAIG …      1    14  296. 0.594 0.607 0.493  419.  499.   391. 0.610
##  2  2008 IAN POUL…      2    42  284. 0.598 0.598 0.442  430.  513.   408. 0.563
##  3  2008 JIM FURYK      5    12  280. 0.694 0.668 0.5    412.  571.   387. 0.603
##  4  2008 ERNIE ELS      7     5  292. 0.569 0.613 0.445  399.  523.   354. 0.566
##  5  2008 STEPHEN …      7    25  284. 0.627 0.650 0.509  393.  493.   380. 0.587
##  6  2008 STEVE ST…      7     8  284. 0.562 0.638 0.488  420.  533.   396. 0.618
##  7  2008 ROBERT A…      7    30  292. 0.656 0.704 0.546  417.  556.   379. 0.553
##  8  2008 BEN CURT…      7    86  285. 0.672 0.635 0.438  413.  506.   383. 0.592
##  9  2008 PAUL CAS…      7    48  299. 0.624 0.657 0.463  411.  558.   377. 0.520
## 10  2008 ANTHONY …      7    13  301. 0.583 0.658 0.523  427.  546.   382. 0.593
## # ℹ 634 more rows
## # ℹ 10 more variables: RSCRMB <dbl>, SSCRMB <dbl>, SCRMB30 <dbl>, PPR <dbl>,
## #   one_Putt <dbl>, two_Putt <dbl>, three_PuttA <dbl>, BirdieAvg <dbl>,
## #   BogeyA <dbl>, BBack <dbl>

# Exportar o dataset modificado para Excel
write_xlsx(data, "C:/Users/felip/Desktop/MBA Data Science/dataset_modificado.xlsx")

Análise de correlação

# Calcular a matriz de correlação para variáveis numéricas com método Pearson
numeric_vars <- sapply(data, is.numeric)
correlations_pearson <- cor(data[, numeric_vars], use = "complete.obs", method = "pearson")

# Calcular a matriz de correlação para variáveis numéricas com método Spearman
correlations_spearman <- cor(data[, numeric_vars], use = "complete.obs", method = "spearman")

# Comparar correlações de Pearson e de Spearman com FINISH
cor_with_finish_pearson <- correlations_pearson["FINISH", ]
cor_with_finish_spearman <- correlations_spearman["FINISH", ]

# Ordenar as correlações em ordem decrescente
cor_with_finish_pearson_sorted <- sort(cor_with_finish_pearson, decreasing = TRUE)
cor_with_finish_spearman_sorted <- sort(cor_with_finish_spearman, decreasing = TRUE)

# Exibir as variáveis mais correlacionadas com FINISH (Pearson)
print("Correlação de Pearson com FINISH:")

## [1] "Correlação de Pearson com FINISH:"

print(cor_with_finish_pearson_sorted)

##       FINISH       BogeyA         OWGR          PPR  three_PuttA     two_Putt 
##  1.000000000  0.267847572  0.215435842  0.194387245  0.153778881  0.091273252 
##       APP100         DAcc          P2H         RP2H         Year      SCRMB30 
##  0.073543760  0.049835976  0.012675130  0.001721538 -0.064759374 -0.089700819 
##        GIRNF       SSCRMB          GIR        DDist       RSCRMB        BBack 
## -0.100796939 -0.126277067 -0.141501437 -0.164636955 -0.168711017 -0.174465099 
##     one_Putt        SCRMB    BirdieAvg 
## -0.188980730 -0.217652034 -0.277885378

# Exibir as variáveis mais correlacionadas com FINISH (Spearman)
print("Correlação de Spearman com FINISH:")

## [1] "Correlação de Spearman com FINISH:"

print(cor_with_finish_spearman_sorted)

##      FINISH        OWGR      BogeyA         PPR three_PuttA    two_Putt 
##  1.00000000  0.33700359  0.27230038  0.20364944  0.16835951  0.10434539 
##      APP100        DAcc         P2H        RP2H        Year       GIRNF 
##  0.07438513  0.03838379  0.02579832  0.01628659 -0.05795344 -0.09060330 
##     SCRMB30      SSCRMB         GIR      RSCRMB       DDist       BBack 
## -0.12014648 -0.12560576 -0.14008975 -0.15626903 -0.16469255 -0.17182385 
##    one_Putt       SCRMB   BirdieAvg 
## -0.18012230 -0.21884371 -0.27704503

# Criar um dataframe para comparação
comparison_df <- data.frame(
  Variable = names(cor_with_finish_pearson),
  Pearson = cor_with_finish_pearson,
  Spearman = cor_with_finish_spearman
)

write.csv(comparison_df, "C:/Users/felip/Desktop/MBA Data Science/comparison_correlations.csv", row.names = FALSE)

print(comparison_df)

##                Variable      Pearson    Spearman
## Year               Year -0.064759374 -0.05795344
## FINISH           FINISH  1.000000000  1.00000000
## OWGR               OWGR  0.215435842  0.33700359
## DDist             DDist -0.164636955 -0.16469255
## DAcc               DAcc  0.049835976  0.03838379
## GIR                 GIR -0.141501437 -0.14008975
## GIRNF             GIRNF -0.100796939 -0.09060330
## P2H                 P2H  0.012675130  0.02579832
## RP2H               RP2H  0.001721538  0.01628659
## APP100           APP100  0.073543760  0.07438513
## SCRMB             SCRMB -0.217652034 -0.21884371
## RSCRMB           RSCRMB -0.168711017 -0.15626903
## SSCRMB           SSCRMB -0.126277067 -0.12560576
## SCRMB30         SCRMB30 -0.089700819 -0.12014648
## PPR                 PPR  0.194387245  0.20364944
## one_Putt       one_Putt -0.188980730 -0.18012230
## two_Putt       two_Putt  0.091273252  0.10434539
## three_PuttA three_PuttA  0.153778881  0.16835951
## BirdieAvg     BirdieAvg -0.277885378 -0.27704503
## BogeyA           BogeyA  0.267847572  0.27230038
## BBack             BBack -0.174465099 -0.17182385

A partir das comparação chegou-se a conclusão de que a Correlação de Spearman era mais adequeada.

## Converter a matriz de correlação SPEARMAN para o formato "long"
correlations_spearman_melt <- melt(correlations_spearman)

# Criar o gráfico de calor (heatmap) para Spearman
ggplot(data = correlations_spearman_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlação (Spearman)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  coord_fixed() +
  ggtitle("Matriz de Correlação das Variáveis (Spearman)")

Avaliação da multicolinearidade

# Multicolinearidade 1: Verificar correlação entre variáveis independentes (|cor| > 0.7)
high_corr_pairs <- which(abs(correlations_spearman) > 0.7 & abs(correlations_spearman) < 1, arr.ind = TRUE)

# Exibir pares de variáveis com alta correlação
if (nrow(high_corr_pairs) > 0) {
  print("Pares de variáveis com alta correlação (|cor| > 0.7):")
  for (i in 1:nrow(high_corr_pairs)) {
    print(paste(rownames(correlations_spearman)[high_corr_pairs[i, "row"]],
                colnames(correlations_spearman)[high_corr_pairs[i, "col"]],
                correlations_spearman[high_corr_pairs[i, "row"], high_corr_pairs[i, "col"]],
                sep = " - "))
  }
} else {
  print("Nenhum par de variáveis com alta correlação foi encontrado.")
}

## [1] "Pares de variáveis com alta correlação (|cor| > 0.7):"
## [1] "APP100 - P2H - 0.734941139919699"
## [1] "P2H - APP100 - 0.734941139919699"
## [1] "BogeyA - SCRMB - -0.788960506129052"
## [1] "one_Putt - PPR - -0.940561132174655"
## [1] "two_Putt - PPR - 0.796233607379801"
## [1] "PPR - one_Putt - -0.940561132174655"
## [1] "two_Putt - one_Putt - -0.821622253447881"
## [1] "PPR - two_Putt - 0.796233607379801"
## [1] "one_Putt - two_Putt - -0.821622253447881"
## [1] "SCRMB - BogeyA - -0.788960506129052"

# Multicolinearidade 2: Avaliar a multicolinearidade com VIF
independent_vars <- setdiff(names(data), c("FINISH", "PLAYER"))
formula <- as.formula(paste("FINISH ~", paste(independent_vars, collapse = " + ")))
modelo <- lm(formula, data = data)
vif_values <- vif(modelo)
print("Valores de VIF:")

## [1] "Valores de VIF:"

print(vif_values)

##        Year        OWGR       DDist        DAcc         GIR       GIRNF 
##    2.220449    1.544709    4.461131    3.813031   38.510635    3.108094 
##         P2H        RP2H      APP100       SCRMB      RSCRMB      SSCRMB 
##   10.908545    2.650109    6.854787   38.658197    2.146171    1.813088 
##     SCRMB30         PPR    one_Putt    two_Putt three_PuttA   BirdieAvg 
##    1.376402   47.704703   12.705800    7.248482   15.703796    9.714474 
##      BogeyA       BBack 
##   87.863224    1.519372

# Remover variáveis redundantes
vars_to_remove <- c("P2H", "BogeyA", "one_Putt", "two_Putt")
data_reduced <- data[, !(names(data) %in% vars_to_remove)]
formula_reduced <- as.formula(paste("FINISH ~", paste(setdiff(names(data_reduced), c("FINISH", "PLAYER")), collapse = " + ")))
modelo_reduced <- lm(formula_reduced, data = data_reduced)
vif_reduced <- vif(modelo_reduced)
print("Valores de VIF após redução:")

## [1] "Valores de VIF após redução:"

print(vif_reduced)

##        Year        OWGR       DDist        DAcc         GIR       GIRNF 
##    2.049872    1.329378    4.193518    3.018371   11.450797    3.082175 
##        RP2H      APP100       SCRMB      RSCRMB      SSCRMB     SCRMB30 
##    1.397622    1.433135    5.611281    2.119281    1.789593    1.357656 
##         PPR three_PuttA   BirdieAvg       BBack 
##   16.646685    3.070753    9.328602    1.498434

# Melhorar ainda mais removendo GIR e PPR
vars_to_remove_vif <- c("GIR", "PPR")
data_vif_reduced <- data_reduced[, !(names(data_reduced) %in% vars_to_remove_vif)]
print(data_vif_reduced)

## # A tibble: 644 × 16
##     Year PLAYER  FINISH  OWGR DDist  DAcc GIRNF  RP2H APP100 SCRMB RSCRMB SSCRMB
##    <dbl> <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>
##  1  2008 PADRAI…      1    14  296. 0.594 0.493  499.   391. 0.610  0.604  0.682
##  2  2008 IAN PO…      2    42  284. 0.598 0.442  513.   408. 0.563  0.615  0.549
##  3  2008 JIM FU…      5    12  280. 0.694 0.5    571.   387. 0.603  0.574  0.559
##  4  2008 ERNIE …      7     5  292. 0.569 0.445  523.   354. 0.566  0.505  0.617
##  5  2008 STEPHE…      7    25  284. 0.627 0.509  493.   380. 0.587  0.627  0.544
##  6  2008 STEVE …      7     8  284. 0.562 0.488  533.   396. 0.618  0.672  0.609
##  7  2008 ROBERT…      7    30  292. 0.656 0.546  556.   379. 0.553  0.548  0.474
##  8  2008 BEN CU…      7    86  285. 0.672 0.438  506.   383. 0.592  0.579  0.598
##  9  2008 PAUL C…      7    48  299. 0.624 0.463  558.   377. 0.520  0.462  0.454
## 10  2008 ANTHON…      7    13  301. 0.583 0.523  546.   382. 0.593  0.583  0.547
## # ℹ 634 more rows
## # ℹ 4 more variables: SCRMB30 <dbl>, three_PuttA <dbl>, BirdieAvg <dbl>,
## #   BBack <dbl>

# Reajustar o modelo linear
formula_vif_reduced <- as.formula(paste("FINISH ~", paste(setdiff(names(data_vif_reduced), c("FINISH", "PLAYER")), collapse = " + ")))
modelo_vif_reduced <- lm(formula_vif_reduced, data = data_vif_reduced)

# Recalcular o VIF
vif_reduced_final <- vif(modelo_vif_reduced)
print("VIF após nova redução:")

## [1] "VIF após nova redução:"

print(vif_reduced_final)

##        Year        OWGR       DDist        DAcc       GIRNF        RP2H 
##    2.023265    1.327973    3.848111    1.965644    1.743032    1.377824 
##      APP100       SCRMB      RSCRMB      SSCRMB     SCRMB30 three_PuttA 
##    1.236759    3.728064    2.114948    1.787920    1.357232    1.512457 
##   BirdieAvg       BBack 
##    2.868578    1.490455

# Resumo do modelo
summary(modelo_vif_reduced)

## 
## Call:
## lm(formula = formula_vif_reduced, data = data_vif_reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.720 -17.212  -1.801  15.017  56.800 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.527e+03  4.875e+02  -3.132 0.001819 ** 
## Year         8.869e-01  2.576e-01   3.443 0.000613 ***
## OWGR         2.227e-02  1.282e-02   1.736 0.083035 .  
## DDist       -3.574e-01  1.627e-01  -2.196 0.028441 *  
## DAcc        -6.977e+00  2.307e+01  -0.302 0.762458    
## GIRNF        2.008e+01  2.793e+01   0.719 0.472395    
## RP2H        -1.443e-02  2.902e-02  -0.497 0.619180    
## APP100      -3.178e-02  4.798e-02  -0.662 0.507917    
## SCRMB       -8.361e+01  4.454e+01  -1.877 0.060946 .  
## RSCRMB      -2.945e+01  2.370e+01  -1.242 0.214625    
## SSCRMB      -3.762e+00  1.469e+01  -0.256 0.797985    
## SCRMB30      1.032e+01  1.517e+01   0.680 0.496739    
## three_PuttA  1.924e+02  1.441e+02   1.335 0.182365    
## BirdieAvg   -1.230e+01  3.998e+00  -3.077 0.002180 ** 
## BBack       -1.923e+01  3.012e+01  -0.638 0.523413    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.66 on 629 degrees of freedom
## Multiple R-squared:  0.132,  Adjusted R-squared:  0.1127 
## F-statistic: 6.833 on 14 and 629 DF,  p-value: 3.98e-13

Modelagem estatística

# Remover variáveis não significativas para simplificar o modelo
formula_simplified <- FINISH ~ Year + OWGR + DDist + SCRMB + BirdieAvg
modelo_simplified <- lm(formula_simplified, data = data_vif_reduced)
summary(modelo_simplified)

## 
## Call:
## lm(formula = formula_simplified, data = data_vif_reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.904 -17.262  -2.171  15.804  57.646 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.224e+03  4.329e+02  -2.827  0.00485 ** 
## Year         7.168e-01  2.233e-01   3.210  0.00139 ** 
## OWGR         2.379e-02  1.242e-02   1.915  0.05595 .  
## DDist       -2.414e-01  1.124e-01  -2.147  0.03214 *  
## SCRMB       -1.177e+02  2.639e+01  -4.460 9.69e-06 ***
## BirdieAvg   -1.298e+01  3.303e+00  -3.930 9.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.62 on 638 degrees of freedom
## Multiple R-squared:  0.1234, Adjusted R-squared:  0.1165 
## F-statistic: 17.96 on 5 and 638 DF,  p-value: < 2.2e-16

#Calcular os coeficientes padronizados para comparar a influência relativa das variáveis significativas
lm.beta(modelo_simplified)

##        Year        OWGR       DDist       SCRMB   BirdieAvg 
##  0.14706058  0.07941543 -0.10813029 -0.18950444 -0.20427865

#Criar um gráfico de barras para representar os coeficientes padronizados
beta_values <- data.frame(
  Variable = c("Year", "OWGR", "DDist", "SCRMB", "BirdieAvg"),
  Beta = c(0.147, 0.079, -0.108, -0.190, -0.204)
)
ggplot(beta_values, aes(x = reorder(Variable, Beta), y = Beta, fill = Beta > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Impacto Relativo das Variáveis (Coeficientes Padronizados)",
       x = "Variáveis", y = "Coeficiente Beta") +
  theme_minimal()