Objetivo general
Aplicar conceptos y técnicas de programación.
Analizar un conjunto de datos desde múltiples perspectivas y formas.
Objetivos específicos
Explorar y describir el dataset.
Realizar un notebook con un informe que detalle el comportamiento de cada una de las variables y las relaciones entre las mismas, incluyendo todos los gráficos que consideres necesarios para describir el fenómeno.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.3
library(DT)
## Warning: package 'DT' was built under R version 4.1.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
# Load the steel-industry energy consumption dataset and inspect its structure.
# NOTE(review): setwd() with a machine-specific absolute path makes this script
# non-portable. It is kept so that relative paths keep resolving as before, but
# consider running from a project directory instead.
setwd("C:/Users/YesicaRodriguez/OneDrive - SEI/Documents/Coderhouse")
# With the default encoding the first column came back as "ï..Date", the
# signature of a UTF-8 byte-order mark leaking into the header. Reading with
# "UTF-8-BOM" strips the mark so the column is named plainly "Date".
datasteel <- read.csv(
  "Steel_industry_data.csv",
  fileEncoding = "UTF-8-BOM"
)
str(datasteel)
## 'data.frame': 35040 obs. of 11 variables:
## $ ï..Date : chr "01/01/2018 00:15" "01/01/2018 00:30" "01/01/2018 00:45" "01/01/2018 01:00" ...
## $ Usage_kWh : num 3.17 4 3.24 3.31 3.82 3.28 3.6 3.6 3.28 3.78 ...
## $ Lagging_Current_Reactive_Power_kVarh: num 2.95 4.46 3.28 3.56 4.5 3.56 4.14 4.28 3.64 4.72 ...
## $ Leading_Current_Reactive_Power_kVarh: num 0 0 0 0 0 0 0 0 0 0 ...
## $ CO2_tCO2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Lagging_Current_Power_Factor : num 73.2 66.8 70.3 68.1 64.7 ...
## $ Leading_Current_Power_Factor : num 100 100 100 100 100 100 100 100 100 100 ...
## $ NSM : int 900 1800 2700 3600 4500 5400 6300 7200 8100 9000 ...
## $ Week_Status : chr "Weekday" "Weekday" "Weekday" "Weekday" ...
## $ Day_of_week : chr "Monday" "Monday" "Monday" "Monday" ...
## $ Load_Type : chr "Light_Load" "Light_Load" "Light_Load" "Light_Load" ...
# List the column names of the loaded data frame.
colnames(datasteel)
## [1] "ï..Date"
## [2] "Usage_kWh"
## [3] "Lagging_Current_Reactive_Power_kVarh"
## [4] "Leading_Current_Reactive_Power_kVarh"
## [5] "CO2_tCO2"
## [6] "Lagging_Current_Power_Factor"
## [7] "Leading_Current_Power_Factor"
## [8] "NSM"
## [9] "Week_Status"
## [10] "Day_of_week"
## [11] "Load_Type"
# Drop incomplete rows, if there are any. The result of na.omit() must be
# assigned back: called on its own it only returns a cleaned copy and leaves
# `datasteel` untouched, so the rows with NA would never be removed.
if (anyNA(datasteel)) {
  datasteel <- na.omit(datasteel)
} else {
  print("La base de datos no tiene datos NA")
}
## [1] "La base de datos no tiene datos NA"
# Scatter plot of lagging reactive power (x) against lagging power factor (y),
# with points coloured by load type.
ggplot(
  datasteel,
  aes(
    x = Lagging_Current_Reactive_Power_kVarh,
    y = Lagging_Current_Power_Factor,
    color = Load_Type
  )
) +
  geom_point()
# Bar chart with one bar per load type, filled by day of week.
# position = "fill" stretches every bar to the same height, so the segments
# show proportions rather than raw counts.
datasteel %>%
  ggplot(aes(x = Load_Type, fill = Day_of_week)) +
  geom_bar(position = "fill")
# Mean energy usage for every day-of-week / load-type combination.
# Both grouping columns are converted to factors before grouping.
resumen1 <- datasteel %>%
  mutate(across(c(Day_of_week, Load_Type), factor)) %>%
  group_by(Day_of_week, Load_Type) %>%
  summarise(media_KWh = mean(Usage_kWh), .groups = "drop")

# Show the summary as an interactive table: no row names, and a filter box
# above each column.
DT::datatable(resumen1, rownames = FALSE, filter = "top")
# Variance of energy usage per day of week and week status, computed only on
# readings whose lagging power factor is at most 80.
# The factor conversion happens before the filter, so the factor levels are
# taken from the full dataset.
resumen2 <- datasteel %>%
  mutate(
    Day_of_week = as.factor(Day_of_week),
    Week_Status = as.factor(Week_Status)
  ) %>%
  filter(Lagging_Current_Power_Factor <= 80) %>%
  group_by(Day_of_week, Week_Status) %>%
  summarise(varianza_KWh = var(Usage_kWh), .groups = "drop")

# Show the summary as an interactive table: no row names, and a filter box
# above each column.
DT::datatable(
  resumen2,
  rownames = FALSE,
  filter = "top"
)
# Per-column descriptive statistics for the whole dataset.
datasteel %>%
  summary()
## ï..Date Usage_kWh Lagging_Current_Reactive_Power_kVarh
## Length:35040 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 3.20 1st Qu.: 2.30
## Mode :character Median : 4.57 Median : 5.00
## Mean : 27.39 Mean :13.04
## 3rd Qu.: 51.24 3rd Qu.:22.64
## Max. :157.18 Max. :96.91
## Leading_Current_Reactive_Power_kVarh CO2_tCO2
## Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:0.00000
## Median : 0.000 Median :0.00000
## Mean : 3.871 Mean :0.01152
## 3rd Qu.: 2.090 3rd Qu.:0.02000
## Max. :27.760 Max. :0.07000
## Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.: 63.32 1st Qu.: 99.70 1st Qu.:21375
## Median : 87.96 Median :100.00 Median :42750
## Mean : 80.58 Mean : 84.37 Mean :42750
## 3rd Qu.: 99.02 3rd Qu.:100.00 3rd Qu.:64125
## Max. :100.00 Max. :100.00 Max. :85500
## Week_Status Day_of_week Load_Type
## Length:35040 Length:35040 Length:35040
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Keep only the numeric measurements for the correlation analysis.
# Selecting by type instead of by position (-1, -9, -10, -11) keeps this
# correct if columns are reordered, renamed or added, and guarantees that
# cor() only ever receives numeric columns. With the current schema the result
# is the same seven columns (Usage_kWh through NSM).
datasteel2 <- datasteel %>%
  select(where(is.numeric))
# Two-sided Pearson correlation test between energy usage and lagging power
# factor. The method and alternative are spelled out; both are the defaults.
cor.test(
  x = datasteel2$Usage_kWh,
  y = datasteel2$Lagging_Current_Power_Factor,
  alternative = "two.sided",
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: datasteel2$Usage_kWh and datasteel2$Lagging_Current_Power_Factor
## t = 78.314, df = 35038, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3770135 0.3948354
## sample estimates:
## cor
## 0.3859605
# Pairwise Pearson correlation matrix of all numeric variables.
cor(x = datasteel2, method = "pearson")
## Usage_kWh
## Usage_kWh 1.0000000
## Lagging_Current_Reactive_Power_kVarh 0.8961499
## Leading_Current_Reactive_Power_kVarh -0.3249218
## CO2_tCO2 0.9881798
## Lagging_Current_Power_Factor 0.3859605
## Leading_Current_Power_Factor 0.3535657
## NSM 0.2346103
## Lagging_Current_Reactive_Power_kVarh
## Usage_kWh 0.89614990
## Lagging_Current_Reactive_Power_kVarh 1.00000000
## Leading_Current_Reactive_Power_kVarh -0.40514168
## CO2_tCO2 0.88694771
## Lagging_Current_Power_Factor 0.14453376
## Leading_Current_Power_Factor 0.40771628
## NSM 0.08266237
## Leading_Current_Reactive_Power_kVarh
## Usage_kWh -0.3249218
## Lagging_Current_Reactive_Power_kVarh -0.4051417
## Leading_Current_Reactive_Power_kVarh 1.0000000
## CO2_tCO2 -0.3327766
## Lagging_Current_Power_Factor 0.5267705
## Leading_Current_Power_Factor -0.9440390
## NSM 0.3716046
## CO2_tCO2 Lagging_Current_Power_Factor
## Usage_kWh 0.9881798 0.3859605
## Lagging_Current_Reactive_Power_kVarh 0.8869477 0.1445338
## Leading_Current_Reactive_Power_kVarh -0.3327766 0.5267705
## CO2_tCO2 1.0000000 0.3796047
## Lagging_Current_Power_Factor 0.3796047 1.0000000
## Leading_Current_Power_Factor 0.3600191 -0.5199669
## NSM 0.2317260 0.5652695
## Leading_Current_Power_Factor NSM
## Usage_kWh 0.3535657 0.23461033
## Lagging_Current_Reactive_Power_kVarh 0.4077163 0.08266237
## Leading_Current_Reactive_Power_kVarh -0.9440390 0.37160457
## CO2_tCO2 0.3600191 0.23172600
## Lagging_Current_Power_Factor -0.5199669 0.56526951
## Leading_Current_Power_Factor 1.0000000 -0.36056299
## NSM -0.3605630 1.00000000