library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(dslabs)
library(writexl)
library(lmtest)
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(plotly)
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## corrplot 0.94 loaded
library(readxl)
library(reshape2) # Necesaria para transformar los datos
salesbike<- read_xlsx("Sales.xlsx")
head(salesbike)
## # A tibble: 6 × 18
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novemb… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novemb… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## # ℹ 11 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Unit_Cost <dbl>,
## # Unit_Price <dbl>, Profit <dbl>, Cost <dbl>, Revenue <dbl>
totals_by_country <- salesbike %>%
group_by(Country) %>%
summarise(
Total_Unit_Cost = sum(Unit_Cost, na.rm = TRUE),
Total_Unit_Price = sum(Unit_Price, na.rm = TRUE)
)
totals_by_country
## # A tibble: 6 × 3
## Country Total_Unit_Cost Total_Unit_Price
## <chr> <dbl> <dbl>
## 1 Australia 8714208 14651442
## 2 Canada 1894558 3242396
## 3 France 3213340 5416258
## 4 Germany 3339196 5637392
## 5 United Kingdom 3668832 6205596
## 6 United States 9383978 16045264
# Crear la gráfica de barras del Total_Unit_Cost por país
ggplot(totals_by_country, aes(x = reorder(Country, -Total_Unit_Cost), y = Total_Unit_Cost)) +
geom_bar(stat = "identity", fill = "steelblue") + # Barras con Total_Unit_Cost
labs(title = "Total Unit Cost por País",
x = "País",
y = "Total Unit Cost") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotar etiquetas de los países
# Crear la gráfica de barras del Total_Unit_Price por país
ggplot(totals_by_country, aes(x = reorder(Country, -Total_Unit_Price), y = Total_Unit_Price)) +
geom_bar(stat = "identity", fill = "steelblue") + # Barras con Total_Unit_Price
labs(title = "Total Unit Price por País",
x = "País",
y = "Total Unit Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotar etiquetas de los países
# Crear la gráfica de pie interactiva
pie_chart_cost <- plot_ly(totals_by_country, labels = ~Country, values = ~Total_Unit_Cost, type = 'pie') %>%
layout(title = 'Distribución de Total Unit Cost por País')
# Mostrar la gráfica interactiva
pie_chart_cost
# Crear la gráfica de pie interactiva para Total_Unit_Price
pie_chart_price <- plot_ly(totals_by_country, labels = ~Country, values = ~Total_Unit_Price, type = 'pie') %>%
layout(title = 'Distribución de Total Unit Price por País')
# Mostrar la gráfica interactiva
pie_chart_price
# Crear la gráfica de barras interactiva
bar_chart <- plot_ly(totals_by_country, x = ~Country, y = ~Total_Unit_Cost, type = 'bar', name = 'Total Unit Cost',
marker = list(color = 'blue')) %>%
add_trace(y = ~Total_Unit_Price, name = 'Total Unit Price', marker = list(color = 'red')) %>%
layout(title = 'Total Unit Cost y Total Unit Price por País',
xaxis = list(title = 'País'),
yaxis = list(title = 'Valor Total'),
barmode = 'group') # Modo de barras agrupadas
# Mostrar la gráfica interactiva
bar_chart
# Crear la gráfica de dispersión interactiva para Total_Unit_Cost y Total_Unit_Price por país
scatter_plot <- plot_ly(totals_by_country, x = ~Total_Unit_Cost, y = ~Total_Unit_Price,
text = ~Country, mode = 'markers', marker = list(size = 10)) %>%
layout(title = 'Relación entre Total Unit Cost y Total Unit Price por País',
xaxis = list(title = 'Total Unit Cost'),
yaxis = list(title = 'Total Unit Price'))
# Mostrar la gráfica interactiva
scatter_plot
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
#Matriz de correlación
# Selecciona solo las variables numéricas
numeric_vars <- salesbike[, sapply(salesbike, is.numeric)]
# Calcula la matriz de correlación
correlation_matrix <- cor(numeric_vars)
#correlation_matrix
corrplot(correlation_matrix, method = "number", tl.col = "black")
## En la matriz de correlación se puede observa que las variables con un
buen grado de correlación son: Unit_Cost y Unit_Price.
correlations <- totals_by_country %>%
summarise(
Pearson_Correlation = cor(Total_Unit_Cost, Total_Unit_Price),method = "pearson")
correlations
## # A tibble: 1 × 2
## Pearson_Correlation method
## <dbl> <chr>
## 1 1.00 pearson
# Ajustar el modelo de regresión lineal simple entre Unit_Cost y Unit_Price totalizados por país
model <- lm(Total_Unit_Price ~ Total_Unit_Cost, data = totals_by_country)
# Obtener los coeficientes del modelo
coeficientes <- coef(model)
# Extraer la pendiente (slope) y el intercepto
pendiente <- coeficientes["Total_Unit_Cost"]
intercepto <- coeficientes["(Intercept)"]
# Mostrar los resultados
cat("La pendiente (slope) es:", pendiente, "\n")
## La pendiente (slope) es: 1.698606
cat("El intercepto es:", intercepto, "\n")
## El intercepto es: -20589.09
# Mostrar los resultados
summary(model)
##
## Call:
## lm(formula = Total_Unit_Price ~ Total_Unit_Cost, data = totals_by_country)
##
## Residuals:
## 1 2 3 4 5 6
## -129978 44877 -21353 -13998 -5716 126168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.059e+04 7.713e+04 -0.267 0.803
## Total_Unit_Cost 1.699e+00 1.328e-02 127.950 2.24e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94220 on 4 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9997
## F-statistic: 1.637e+04 on 1 and 4 DF, p-value: 2.238e-08
# Ajustar el modelo de regresión lineal entre Total_Unit_Cost y Total_Unit_Price
model <- lm(Total_Unit_Price ~ Total_Unit_Cost, data = totals_by_country)
# Calcular los residuales del modelo
residuales <- residuals(model)
# Realizar la prueba t para la media de los residuales
t_test_result <- t.test(residuales, mu = 0)
# Mostrar el resultado de la prueba t
print(t_test_result)
##
## One Sample t-test
##
## data: residuales
## t = -3.1722e-16, df = 5, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -88441.03 88441.03
## sample estimates:
## mean of x
## -1.091394e-11
# Crear el histograma de los residuales
ggplot(data.frame(residuales), aes(x = residuales)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black") +
stat_function(fun = dnorm, args = list(mean = mean(residuales), sd = sd(residuales)),
color = "red", size = 1) +
labs(title = "Distribución de los Residuales con Curva Normal Superpuesta",
x = "Residuales",
y = "Densidad") +
geom_vline(xintercept = mean(residuales), linetype = "dashed", color = "blue") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Crear la gráfica utilizando ggplot2
ggplot_plot <- ggplot(totals_by_country, aes(x = Total_Unit_Cost, y = Total_Unit_Price)) +
geom_point(color = "blue", size = 3) + # Puntos de los datos totalizados
geom_smooth(method = "lm", color = "red", se = FALSE) + # Línea de regresión ajustada
labs(title = "Relación entre Unit Cost y Unit Price Totalizados por País",
x = "Total Unit Cost",
y = "Total Unit Price") +
theme_minimal()
# Convertir el gráfico a interactivo usando plotly
interactive_plot <- ggplotly(ggplot_plot)
## `geom_smooth()` using formula = 'y ~ x'
# Mostrar la gráfica interactiva
interactive_plot
#Analisis de los residuales
ei<-residuals(model);ei
## 1 2 3 4 5 6
## -129977.965 44876.839 -21352.643 -13998.443 -5716.247 126168.459
plot(ei)
## Los errores tienen media cero
# Calcular la media de los residuales
media_residuales <- mean(ei)
# Crear el gráfico de residuales
residuals_plot <- ggplot(data = data.frame(residuals = ei, fitted = model$fitted.values),
aes(x = fitted, y = residuals)) +
geom_point(color = "blue") +
geom_hline(yintercept = media_residuales, color = "red", linetype = "dashed") +
ggtitle("Gráfico de Residuales con Línea de la Media") +
xlab("Valores Predichos") +
ylab("Residuales") +
coord_cartesian(ylim = c(-10^6, 10^6)) +
theme_minimal()
# Mostrar el gráfico
residuals_plot
pred<-sort(fitted(model));pred
## 2 3 4 5 1 6
## 3197519 5437611 5651390 6211312 14781420 15919096
#Detección de puntos influyentes
#plot(cooks.distance(modelo))
cooks_d <- cooks.distance(model)
cooks_df <- data.frame(
Index = seq_along(cooks_d),
CooksDistance = cooks_d
)
cooks_plot <- ggplot(cooks_df, aes(x = Index, y = CooksDistance)) +
geom_point(color = "blue") + # Puntos de Cook's distance
geom_hline(yintercept = 4 / length(cooks_d), color = "red", linetype = "dashed") + # Línea de referencia
ggtitle("Gráfico de Cook's Distance") +
xlab("Índice del Observación") +
ylab("Cook's Distance") +
coord_cartesian(ylim = c(-5, 5)) +
theme_minimal()
# Mostrar el gráfico
cooks_plot
t.test(ei)
##
## One Sample t-test
##
## data: ei
## t = -3.1722e-16, df = 5, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -88441.03 88441.03
## sample estimates:
## mean of x
## -1.091394e-11
# Q-Q Plot de los residuales
qq_plot <- ggplot(data = data.frame(residuals = ei), aes(sample = residuals)) +
stat_qq() +
stat_qq_line() +
ggtitle("Q-Q Plot de los Residuales") +
coord_cartesian(ylim = c(-10^6, 10^6)) +
theme_minimal()
# Mostrar el gráfico
qq_plot
shapiro.test(ei)
##
## Shapiro-Wilk normality test
##
## data: ei
## W = 0.95357, p-value = 0.769
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 5.4857, df = 1, p-value = 0.01917
dwtest(model,alternative = "two.sided")
##
## Durbin-Watson test
##
## data: model
## DW = 1.4778, p-value = 0.3892
## alternative hypothesis: true autocorrelation is not 0