Para este análisis hemos tomado el conjunto de datos Iris de Python, lo hemos convertido en un archivo pkl, lo volvimos a abrir y lo exportamos como CSV, con el propósito de repetir después este mismo procedimiento con el archivo de datos del artículo titulado “Scalable hierarchical clustering”.

Primero cargamos la base de datos exportada desde Python:

library(readr)
# Load the Iris measurements that were exported from Python as a CSV file.
# The path is relative to the current user's home directory (`~`), so it must
# be adjusted when the script is run on another machine.
iris_unpickled <- read_csv("~/Documents/uni/Práctica profesionales/Code/iris_unpickled.csv")
## Rows: 150 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): sepal length (cm), sepal width (cm), petal length (cm), petal width...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a spreadsheet-style data viewer, which is only useful in an
# interactive session; skip it when the script is run or knitted in batch mode.
if (interactive()) {
  View(iris_unpickled)
}

Ahora para conocer las relaciones existentes, hacemos una matriz de diagramas de dispersión:

# Scatterplot matrix: one panel for every pair of columns in the data.
pairs(iris_unpickled)

Para cuantificarlas, obtenemos la matriz de coeficientes de correlación y realizamos un heatmap para visualizar más fácilmente la información:

#Hacemos el heatmap plot
  library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
  library(reshape2)
  
  # Build the correlation matrix between all columns of the data
  # (cor() uses the Pearson coefficient unless another method is given).

  cor_matrix <- cor(iris_unpickled)
  
  # Keep only the upper triangle (diagonal included) of a correlation matrix.
  #
  # cor_matrix: a matrix, e.g. the result of cor().
  # Returns the same matrix with every entry strictly below the diagonal
  # replaced by NA; dimensions and dimnames are left untouched.
  get_upper_tri <- function(cor_matrix) {
    below_diag <- lower.tri(cor_matrix)
    replace(cor_matrix, below_diag, NA)
  }
  # Reorder a correlation matrix so that similar variables sit next to each
  # other.
  #
  # cor_matrix: a square correlation matrix; hclust() needs at least two
  #   variables to cluster.
  # Returns cor_matrix with its rows and columns permuted by the leaf order of
  # a hierarchical clustering of the variables. The value is returned visibly
  # (the function no longer ends in an assignment, which returned it
  # invisibly).
  reorder_cormat <- function(cor_matrix) {
    # Turn correlation into a distance: r = 1 maps to 0 and r = -1 maps to 1.
    dd <- as.dist((1 - cor_matrix) / 2)
    hc <- hclust(dd)
    cor_matrix[hc$order, hc$order]
  }
  
  # Put the variables in clustering order, then blank out the lower triangle
  # so that every pair of variables is drawn only once.
  cor_matrix <- reorder_cormat(cor_matrix)

  upper_tri <- get_upper_tri(cor_matrix)

  # Long-format copy of the upper triangle with the NA cells removed. The
  # plotly call below reads the matrix directly and does not use this object.
  melted_lower_tri <- melt(upper_tri, na.rm = TRUE)

  # Build the heatmap with plotly
  heatmap <- plot_ly(
    x = colnames(upper_tri),       # column names
    y = rownames(upper_tri),       # row names
    z = upper_tri,                 # upper triangle of the correlation matrix
    type = "heatmap",              # trace type
    #colorscale = "RdBu",           # colour scale (disabled)
    colorbar = list(title = "Correlación")  # colour bar title
  ) %>%
    layout(
      # Anchoring y to x is enough to keep the cells square. Anchoring x back
      # to y as well forms a loop, which plotly documents as redundant: it
      # ignores one of the two constraints.
      yaxis = list(scaleanchor = "x"),
      title = "Heatmap correlaciones Iris"
    )

  # Show the heatmap
  heatmap

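Podemos notar una fuerte relación lineal entre las siguientes parejas de variables: longitud del pétalo y longitud del sépalo, ancho del pétalo y longitud del sépalo, y ancho del pétalo y longitud del pétalo.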

Realizamos los ajustes lineales correspondientes usando mínimos cuadrados:

# List the column names. They contain spaces and parentheses, so they have to
# be wrapped in backticks when used in the model formulas below.
colnames(iris_unpickled)
## [1] "sepal length (cm)" "sepal width (cm)"  "petal length (cm)"
## [4] "petal width (cm)"
# Least-squares fit of petal length as a linear function of sepal length,
# followed by the coefficient table and goodness-of-fit summary.
reg_pl_sl<- lm(`petal length (cm)`~ `sepal length (cm)`, data = iris_unpickled)
summary(reg_pl_sl)
## 
## Call:
## lm(formula = `petal length (cm)` ~ `sepal length (cm)`, data = iris_unpickled)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47747 -0.59072 -0.00668  0.60484  2.49512 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -7.10144    0.50666  -14.02   <2e-16 ***
## `sepal length (cm)`  1.85843    0.08586   21.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8678 on 148 degrees of freedom
## Multiple R-squared:   0.76,  Adjusted R-squared:  0.7583 
## F-statistic: 468.6 on 1 and 148 DF,  p-value: < 2.2e-16

Podemos observar la relación con un scatterplot:

# Petal length against sepal length, with the fitted line from reg_pl_sl
# drawn on top in red.
plot(
  x = iris_unpickled[["sepal length (cm)"]],
  y = iris_unpickled[["petal length (cm)"]],
  xlab = "Sepal Length",
  ylab = "Petal length",
  main = "Scatterplot with regression line"
)
abline(reg_pl_sl, col = "red")

# Least-squares fit of petal width as a linear function of sepal length,
# followed by the coefficient table and goodness-of-fit summary.
reg_pw_sl<- lm(`petal width (cm)`~ `sepal length (cm)`, data = iris_unpickled)
summary(reg_pw_sl)
## 
## Call:
## lm(formula = `petal width (cm)` ~ `sepal length (cm)`, data = iris_unpickled)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96671 -0.35936 -0.01787  0.28388  1.23329 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -3.20022    0.25689  -12.46   <2e-16 ***
## `sepal length (cm)`  0.75292    0.04353   17.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.44 on 148 degrees of freedom
## Multiple R-squared:  0.669,  Adjusted R-squared:  0.6668 
## F-statistic: 299.2 on 1 and 148 DF,  p-value: < 2.2e-16
# Petal width against sepal length, with the fitted line from reg_pw_sl
# drawn on top in red.
plot(
  x = iris_unpickled[["sepal length (cm)"]],
  y = iris_unpickled[["petal width (cm)"]],
  xlab = "Sepal Length",
  ylab = "Petal width",
  main = "Scatterplot with regression line"
)
abline(reg_pw_sl, col = "red")

# Least-squares fit of petal width as a linear function of petal length,
# followed by the coefficient table and goodness-of-fit summary.
reg_pw_pl<- lm(`petal width (cm)`~ `petal length (cm)`, data = iris_unpickled)
summary(reg_pw_pl)
## 
## Call:
## lm(formula = `petal width (cm)` ~ `petal length (cm)`, data = iris_unpickled)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56515 -0.12358 -0.01898  0.13288  0.64272 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.363076   0.039762  -9.131  4.7e-16 ***
## `petal length (cm)`  0.415755   0.009582  43.387  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2065 on 148 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9266 
## F-statistic:  1882 on 1 and 148 DF,  p-value: < 2.2e-16
# Petal width against petal length, with the fitted line from reg_pw_pl
# drawn on top in red.
plot(
  x = iris_unpickled[["petal length (cm)"]],
  y = iris_unpickled[["petal width (cm)"]],
  xlab = "Petal Length",
  ylab = "Petal width",
  main = "Scatterplot with regression line"
)
abline(reg_pw_pl, col = "red")

```