Para este análisis tomamos el conjunto de datos Iris desde Python, lo guardamos como archivo pkl, lo volvimos a abrir y lo exportamos como CSV, con el propósito de repetir después este mismo procedimiento con el archivo de datos del paper titulado “Scalable hierarchical clustering”.
Primero cargamos la base de datos exportada desde Python:
# Read the CSV exported from Python into a tibble.
# NOTE(review): the path is specific to the author's machine.
library(readr)
iris_unpickled <- read_csv(
  file = "~/Documents/uni/Práctica profesionales/Code/iris_unpickled.csv"
)
## Rows: 150 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): sepal length (cm), sepal width (cm), petal length (cm), petal width...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in interactive sessions: View() needs a
# spreadsheet-style viewer, which is not available when the document is
# rendered non-interactively.
if (interactive()) {
  View(iris_unpickled)
}
Ahora para conocer las relaciones existentes, hacemos una matriz de diagramas de dispersión:
# Scatterplot matrix: one panel per pair of columns in the data set.
pairs(x = iris_unpickled)
Para cuantificar estas relaciones obtenemos la matriz de coeficientes de correlación y la representamos con un heatmap, lo que permite visualizar más fácilmente la información:
#Hacemos el heatmap plot
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
# Pairwise Pearson correlation matrix of all columns in the data set.
cor_matrix <- cor(x = iris_unpickled, method = "pearson")
# Keep only the upper triangle (diagonal included) of a matrix by replacing
# every entry strictly below the diagonal with NA.
#
# cor_matrix: the matrix to mask (here, a correlation matrix).
# Returns: the same matrix, with its strictly-lower triangle set to NA.
get_upper_tri <- function(cor_matrix) {
  below_diagonal <- lower.tri(cor_matrix)
  cor_matrix[below_diagonal] <- NA
  cor_matrix
}
# Reorder the rows and columns of a correlation matrix using the leaf order
# of a hierarchical clustering of its variables.
#
# cor_matrix: square correlation matrix.
# Returns: cor_matrix with rows and columns permuted by the hclust() order.
#   A matrix with fewer than two rows is returned unchanged.
reorder_cormat <- function(cor_matrix) {
  # hclust() requires at least two objects; there is nothing to reorder.
  if (nrow(cor_matrix) < 2L) {
    return(cor_matrix)
  }
  # Use correlation between variables as distance: r = 1 maps to 0 and
  # r = -1 maps to 1.
  dd <- as.dist((1 - cor_matrix) / 2)
  hc <- hclust(dd)
  # End on the value itself (not an assignment) so the result is returned
  # visibly.
  cor_matrix[hc$order, hc$order]
}
# Reorder the correlation matrix by clustering order, then mask its lower
# triangle so that each pair of variables appears only once.
cor_matrix <- reorder_cormat(cor_matrix)
upper_tri <- get_upper_tri(cor_matrix)
# Long-format copy of the masked matrix, with the NA cells dropped.
# NOTE(review): despite its name this holds the upper triangle.
melted_lower_tri <- melt(data = upper_tri, na.rm = TRUE)
# Build the Plotly heatmap from the masked correlation matrix.
heatmap <- plot_ly(
  x = colnames(upper_tri), # column names of the matrix
  y = rownames(upper_tri), # row names of the matrix
  z = upper_tri, # upper triangle of the reordered correlation matrix
  type = "heatmap", # trace type
  colorbar = list(title = "Correlación") # colour-bar title
) %>%
  layout(
    # Tie the y scale to the x scale. Anchoring x back to y as well would form
    # a loop, which plotly treats as redundant, so one anchor is enough.
    yaxis = list(scaleanchor = "x"),
    title = "Heatmap correlaciones Iris"
  )
# Show the heatmap
heatmap
Podemos notar una fuerte relación lineal entre las variables:
petal length vs sepal length
petal width vs sepal length
petal width vs petal length.
Realizamos los ajustes lineales correspondientes usando mínimos cuadrados:
# List the column names exactly as they must be written in the formulas below.
names(iris_unpickled)
## [1] "sepal length (cm)" "sepal width (cm)" "petal length (cm)"
## [4] "petal width (cm)"
# Least-squares fit of petal length on sepal length, then print its summary.
reg_pl_sl <- lm(
  formula = `petal length (cm)` ~ `sepal length (cm)`,
  data = iris_unpickled
)
summary(reg_pl_sl)
##
## Call:
## lm(formula = `petal length (cm)` ~ `sepal length (cm)`, data = iris_unpickled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47747 -0.59072 -0.00668 0.60484 2.49512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.10144 0.50666 -14.02 <2e-16 ***
## `sepal length (cm)` 1.85843 0.08586 21.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8678 on 148 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.7583
## F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
Podemos observar la relación con un scatterplot:
# Scatterplot of petal length against sepal length, with the fitted line in
# red.
plot(
  x = iris_unpickled[["sepal length (cm)"]],
  y = iris_unpickled[["petal length (cm)"]],
  xlab = "Sepal Length",
  ylab = "Petal length",
  main = "Scatterplot with regression line"
)
abline(reg_pl_sl, col = "red")
# Least-squares fit of petal width on sepal length, then print its summary.
reg_pw_sl <- lm(
  formula = `petal width (cm)` ~ `sepal length (cm)`,
  data = iris_unpickled
)
summary(reg_pw_sl)
##
## Call:
## lm(formula = `petal width (cm)` ~ `sepal length (cm)`, data = iris_unpickled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96671 -0.35936 -0.01787 0.28388 1.23329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.20022 0.25689 -12.46 <2e-16 ***
## `sepal length (cm)` 0.75292 0.04353 17.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.44 on 148 degrees of freedom
## Multiple R-squared: 0.669, Adjusted R-squared: 0.6668
## F-statistic: 299.2 on 1 and 148 DF, p-value: < 2.2e-16
# Scatterplot of petal width against sepal length, with the fitted line in
# red.
plot(
  x = iris_unpickled[["sepal length (cm)"]],
  y = iris_unpickled[["petal width (cm)"]],
  xlab = "Sepal Length",
  ylab = "Petal width",
  main = "Scatterplot with regression line"
)
abline(reg_pw_sl, col = "red")
# Least-squares fit of petal width on petal length, then print its summary.
reg_pw_pl <- lm(
  formula = `petal width (cm)` ~ `petal length (cm)`,
  data = iris_unpickled
)
summary(reg_pw_pl)
##
## Call:
## lm(formula = `petal width (cm)` ~ `petal length (cm)`, data = iris_unpickled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56515 -0.12358 -0.01898 0.13288 0.64272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.363076 0.039762 -9.131 4.7e-16 ***
## `petal length (cm)` 0.415755 0.009582 43.387 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2065 on 148 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
## F-statistic: 1882 on 1 and 148 DF, p-value: < 2.2e-16
# Scatterplot of petal width against petal length, with the fitted line in
# red.
plot(
  x = iris_unpickled[["petal length (cm)"]],
  y = iris_unpickled[["petal width (cm)"]],
  xlab = "Petal Length",
  ylab = "Petal width",
  main = "Scatterplot with regression line"
)
abline(reg_pw_pl, col = "red")
```