This analyses is between 2016 and 2021. We want to see quality
Producción top de los 10 mejores investigadores
Tabla
researcher_model_1 <-
researchers |>
select(articulos, inicio_vinculacion) |>
separate_rows(inicio_vinculacion, sep = "; ") |>
separate_rows(articulos, sep = "; ") |>
mutate(inicio_vinculacion = ymd(inicio_vinculacion),
dias_vinculados = today() - inicio_vinculacion) |>
select(-inicio_vinculacion) |>
mutate(articulos = as.numeric(articulos),
dias_vinculados = as.numeric(dias_vinculados))
Checking histograms
par(mfrow = c(1,2))
researcher_model_1 |>
ggplot(aes(x = articulos)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
researcher_model_1 |>
ggplot(aes(x = dias_vinculados)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Checking Q-Q plot
plot.new()
par(mfrow = c(1,2))
qqnorm(researcher_model_1$articulos, main='Normal')
qqline(researcher_model_1$articulos)
qqnorm(researcher_model_1$dias_vinculados, main='Non-normal')
qqline(researcher_model_1$dias_vinculados)
Shapiro-wilk test
shapiro.test(researcher_model_1$articulos)
##
## Shapiro-Wilk normality test
##
## data: researcher_model_1$articulos
## W = 0.78283, p-value = 3.746e-16
shapiro.test(researcher_model_1$dias_vinculados)
##
## Shapiro-Wilk normality test
##
## data: researcher_model_1$dias_vinculados
## W = 0.89067, p-value = 4.555e-11
p-value less than 0.05 - not normally distributed
We need to transform the data
articulos_bestnor <-
bestNormalize(researcher_model_1$articulos)
articulos_trans <-
predict(articulos_bestnor,
newdata = articulos_bestnor$x.t,
inverse = TRUE)
dias_vinculados_bestnor <-
bestNormalize(researcher_model_1$dias_vinculados)
dias_vinculados_trans <-
predict(dias_vinculados_bestnor, newdata = dias_vinculados_bestnor$x.t,
inverse = TRUE)
researcher_trans <-
tibble(articulos = articulos_trans,
dias_vinculados = dias_vinculados_trans)
model_1 <-
lm(articulos ~ dias_vinculados, researcher_trans)
summary(model_1)
##
## Call:
## lm(formula = articulos ~ dias_vinculados, data = researcher_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.084 -3.126 -1.381 1.688 25.805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5038607 0.6873586 0.733 0.464
## dias_vinculados 0.0022295 0.0002898 7.693 6.13e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.393 on 203 degrees of freedom
## Multiple R-squared: 0.2257, Adjusted R-squared: 0.2219
## F-statistic: 59.18 on 1 and 203 DF, p-value: 6.127e-13
Checking supositions
check_model(model_1)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Loading required namespace: qqplotr
Report
report(model_1)
## We fitted a linear model (estimated using OLS) to predict articulos with dias_vinculados (formula: articulos ~ dias_vinculados). The model explains a statistically significant and moderate proportion of variance (R2 = 0.23, F(1, 203) = 59.18, p < .001, adj. R2 = 0.22). The model's intercept, corresponding to dias_vinculados = 0, is at 0.50 (95% CI [-0.85, 1.86], t(203) = 0.73, p = 0.464). Within this model:
##
## - The effect of dias vinculados is statistically significant and positive (beta = 2.23e-03, 95% CI [1.66e-03, 2.80e-03], t(203) = 7.69, p < .001; Std. beta = 0.48, 95% CI [0.35, 0.60])
##
## Standardized parameters were obtained by fitting the model on a standardized version of the dataset. 95% Confidence Intervals (CIs) and p-values were computed using the Wald approximation.
researchres_split <-
initial_split(researcher_trans,
prop = 0.75,
strata = articulos)
researchers_training <-
researchres_split |>
training()
researchers_testing <-
researchres_split |>
testing()
lm_model <-
linear_reg() |>
set_engine("lm") |>
set_mode("regression")
lm_fit <-
lm_model |>
fit(articulos ~ dias_vinculados,
researchers_training)
tidy(lm_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.629 0.780 0.806 0.421
## 2 dias_vinculados 0.00227 0.000350 6.49 0.00000000119
Making predictions
researchers_predictions <-
lm_fit |>
predict(new_data = researchers_testing)
researchers_test_results <-
researchers_testing |>
select(articulos, dias_vinculados) |>
bind_cols(researchers_predictions)
Evaluating the model performance
researchers_test_results |>
yardstick::rmse(truth = articulos,
estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.87
r2 metric
researchers_test_results |>
rsq(truth = articulos,
estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.250
ggplot(researchers_test_results, aes(x = articulos, y = .pred)) +
geom_point(alpha = 0.5) +
geom_abline(color = 'blue', linetype = 2) +
coord_obs_pred() +
labs(x = 'articulos actuales', y = 'Predicted articles')
lm_last_fit <-
lm_model |>
last_fit(articulos ~ dias_viculados,
split = researchres_split)
## x train/test split: preprocessor 1/1: Error in `glubort()`:
## ! The following predictors were ...
## Warning: All models failed. See the `.notes` column.
lm_last_fit |>
collect_metrics()
## NULL
lm_last_fit |>
collect_predictions()
## # A tibble: 0 × 1
## # … with 1 variable: id <chr>
We need to transform the values of articles variable
researcher_model_2 <-
researcher_model_1 |>
mutate(articulos_bin = if_else(articulos <= 3,
"low",
"high"),
articulos_bin = as_factor(articulos_bin)) |>
select(-articulos)
Data resampling
researcher_bin_split <-
initial_split(researcher_model_2,
prop = 0.75,
strata = "articulos_bin")
researcher_bin_training <-
researcher_bin_split |>
training()
researcher_bin_test <-
researcher_bin_split |>
testing()
Logistic Regression model
logistic_model <-
logistic_reg() |>
set_engine("glm") |>
set_mode("classification")
Model fitting
logistic_fit <-
logistic_model |>
parsnip::fit(articulos_bin ~ dias_vinculados,
data = researcher_bin_training)
Predicting outcome categories
class_preds <-
logistic_fit |>
predict(new_data = researcher_bin_test,
type = "class")
Estimated probabilities
prob_preds <-
logistic_fit |>
predict(new_data = researcher_bin_test,
type = "prob")
Combining results
researcher_bin_results <-
researcher_bin_test |>
bind_cols(class_preds, prob_preds)
Assessing model fit
levels(researcher_model_2$articulos_bin)
## [1] "high" "low"
Confusion matrix
conf_mat(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class)
## Truth
## Prediction high low
## high 16 4
## low 7 25
62.5% correctly classified
accuracy(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.788
Sensitivity proportion of all positive cases that were correctly classified of reserachers who had high productivity, what proportion did our model predict correctly?
sens(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.696
spec(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 spec binary 0.862
Creating a metric set
custom_metrics <-
metric_set(accuracy, sens, spec)
custom_metrics(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.788
## 2 sens binary 0.696
## 3 spec binary 0.862
conf_mat(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class) |>
summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.788
## 2 kap binary 0.565
## 3 sens binary 0.696
## 4 spec binary 0.862
## 5 ppv binary 0.8
## 6 npv binary 0.781
## 7 mcc binary 0.569
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.385
## 11 precision binary 0.8
## 12 recall binary 0.696
## 13 f_meas binary 0.744
Plotting the confusion matrix
conf_mat(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class) |>
autoplot(type = "heatmap")
conf_mat(researcher_bin_results,
truth = articulos_bin,
estimate = .pred_class) |>
autoplot(type = "mosaic")
Calculating performance
researcher_bin_results |>
roc_curve(truth = articulos_bin, .pred_high) |>
autoplot()
# Load file wos.csv and save it as df
roc_auc(researcher_bin_results,
truth = articulos_bin,
.pred_high)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.837