Arthur Martins Ferreira de Sousa

Visualizing Data, Rearranging Observations, Summary Tables, Plotting Histograms, Percentage Polygon, and Cumulative Percentage Polygon

Importing Libraries

library(knitr)
library(kableExtra)
library(tidyverse)
library(gtsummary)
library(hrbrthemes)
library (plotly)
library(ggrepel)
library(kableExtra)

Reading the Data Set

# Load the lamp spreadsheet and keep the two columns used in the analysis.
# The workbook is read through its full path instead of calling setwd(), so
# the session's working directory is left untouched.
data_dir <- "C:/Users/arthu/OneDrive/Área de Trabalho/R_projects/trabalhos de VED/Databases"
data <- readxl::read_xlsx(file.path(data_dir, "lampadas_trab3.xlsx"))
# Copy the spreadsheet columns under shorter names:
# Fabricante -> fabricante, `Vida Útil` -> vida.
df <- data.frame(fabricante = data$Fabricante, vida = data$`Vida Útil`)
# df2 holds the same rows sorted by ascending vida.
df2 <- arrange(df, vida)

Data-set
Manufacturer	Life-span
1	684
1	697
1	720
1	773
2	819
1	821
1	831
1	835
2	836
1	848
1	852
1	852
1	859
1	860
1	868
1	870
1	876
2	888
1	893
2	897
1	899
2	903
1	905
2	907
1	909
1	911
2	912
2	918
1	922
1	924
1	926
1	926
1	938
1	939
2	942
1	943
2	943
1	946
2	952
1	954
2	959
2	962
1	971
1	972
1	977
1	984
2	986
2	992
2	994
2	1004
1	1005
2	1005
2	1007
1	1014
2	1015
1	1016
2	1016
2	1018
2	1020
2	1022
2	1034
2	1038
1	1041
1	1052
2	1072
2	1077
2	1077
1	1080
2	1082
1	1093
2	1096
2	1100
2	1113
2	1113
2	1116
2	1153
2	1154
2	1174
2	1188
2	1230

Selecting observations by the value of a data-frame column

# Split the sorted data by manufacturer code (1 or 2).
# which() keeps only rows where the comparison is TRUE; plain logical
# indexing would return an all-NA row for every NA in fabricante.
k1 <- df2[which(df2$fabricante == 1), ]

k2 <- df2[which(df2$fabricante == 2), ]

# To select rows for several manufacturer codes at once, use %in%:
# k_test <- df2[df2$fabricante %in% c(1, 2), ]
# View(k_test)

Cumulative Frequency table - Manufacturer 1

# Add to every manufacturer-1 row the cumulative percentage (Cum) of
# observations whose vida is less than or equal to that row's vida.
k3 <- k1 %>%
  # one row per distinct vida with its frequency n
  count(vida) %>%
  mutate(Cum = cumsum(n) / sum(n) * 100) %>%
  select(-n) %>%
  # Join back onto the original rows so tied vida values share one Cum.
  # The key is named explicitly so the join does not guess it (and print a
  # "Joining with ..." message).
  right_join(k1, by = "vida") %>%
  # restore the original column order, with Cum last
  select(names(k1), everything())

Data-set + Cumulative Freq
Manufacturer	Life-span	Cumulative Percentage (%)
1	684	2.5
1	697	5.0
1	720	7.5
1	773	10.0
1	821	12.5
1	831	15.0
1	835	17.5
1	848	20.0
1	852	25.0
1	852	25.0
1	859	27.5
1	860	30.0
1	868	32.5
1	870	35.0
1	876	37.5
1	893	40.0
1	899	42.5
1	905	45.0
1	909	47.5
1	911	50.0
1	922	52.5
1	924	55.0
1	926	60.0
1	926	60.0
1	938	62.5
1	939	65.0
1	943	67.5
1	946	70.0
1	954	72.5
1	971	75.0
1	972	77.5
1	977	80.0
1	984	82.5
1	1005	85.0
1	1014	87.5
1	1016	90.0
1	1041	92.5
1	1052	95.0
1	1080	97.5
1	1093	100.0

Cumulative Frequency table - Manufacturer 2

# Add to every manufacturer-2 row the cumulative percentage (Cum) of
# observations whose vida is less than or equal to that row's vida.
k4 <- k2 %>%
  # one row per distinct vida with its frequency n
  count(vida) %>%
  mutate(Cum = cumsum(n) / sum(n) * 100) %>%
  select(-n) %>%
  # Join back onto the original rows so tied vida values share one Cum.
  # The key is named explicitly so the join does not guess it (and print a
  # "Joining with ..." message).
  right_join(k2, by = "vida") %>%
  # restore the original column order, with Cum last
  select(names(k2), everything())

Data-set + Cumulative Freq
Manufacturer	Life-span	Cumulative Percentage (%)
2	819	2.5
2	836	5.0
2	888	7.5
2	897	10.0
2	903	12.5
2	907	15.0
2	912	17.5
2	918	20.0
2	942	22.5
2	943	25.0
2	952	27.5
2	959	30.0
2	962	32.5
2	986	35.0
2	992	37.5
2	994	40.0
2	1004	42.5
2	1005	45.0
2	1007	47.5
2	1015	50.0
2	1016	52.5
2	1018	55.0
2	1020	57.5
2	1022	60.0
2	1034	62.5
2	1038	65.0
2	1072	67.5
2	1077	72.5
2	1077	72.5
2	1082	75.0
2	1096	77.5
2	1100	80.0
2	1113	85.0
2	1113	85.0
2	1116	87.5
2	1153	90.0
2	1154	92.5
2	1174	95.0
2	1188	97.5
2	1230	100.0

Grouping life-span values into fixed-width intervals - Manufacturer 1

# Count manufacturer-1 vida values per 100-wide class (650,750], ..., (1050,1150].
# dig.lab = 4 lets cut() print four-digit limits in full, e.g. "(950,1050]";
# the default of 3 significant digits yields "(950,1.05e+03]".
k5 <- table(cut(k1$vida, seq(650, 1150, by = 100), dig.lab = 4))

Interval
Interval	Frequency (nº observations)
(650,750]	3
(750,850]	5
(850,950]	20
(950,1.05e+03]	9
(1.05e+03,1.15e+03]	3

Grouping life-span values into fixed-width intervals - Manufacturer 2

# Count manufacturer-2 vida values per 100-wide class (750,850], ..., (1150,1250].
# dig.lab = 4 lets cut() print four-digit limits in full, e.g. "(950,1050]";
# the default of 3 significant digits yields "(950,1.05e+03]".
k6 <- table(cut(k2$vida, seq(750, 1250, by = 100), dig.lab = 4))

Interval
Interval	Frequency (nº observations)
(750,850]	2
(850,950]	8
(950,1.05e+03]	16
(1.05e+03,1.15e+03]	9
(1.15e+03,1.25e+03]	5

Summary Table

# Descriptive statistics of vida per manufacturer, one row per fabricante,
# ordered by ascending mean.
t <- df2 %>%
  drop_na(fabricante) %>%
  group_by(fabricante) %>%
  summarise(
    Lower = min(vida),
    Average = mean(vida),
    SD = sd(vida),
    VAR = var(vida),
    Upper = max(vida),
    # range width; Upper and Lower are the columns computed just above
    Difference = Upper - Lower
  ) %>%
  arrange(Average)

# Render the summary as a centred table.
# NOTE(review): the caption reads "Interval", the same caption shown on the
# interval tables earlier in the document; confirm it is intended here.
t %>%
  kable(caption = "Interval", align = "c") %>%
  row_spec(0, font_size = 15) %>%
  kable_styling()

Interval
fabricante	Lower	Average	SD	VAR	Upper	Difference
1	684	909.65	94.30516	8893.464	1093	409
2	819	1018.35	96.90136	9389.874	1230	411

Gtsummary for Summary Tables

# Overall gtsummary table with custom labels, statistics, headers and footnote.
tbl_summary(
  df2,
  label = list(
    vida ~ "LIFE-SPAM",
    fabricante ~ "MANUFACTURER"
  ),
  statistic = list(
    # continuous variables: mean (median) (sd) (variance)
    all_continuous() ~ "{mean} ({median}) ({sd}) ({var})",
    # categorical variables: count / total (percentage)
    all_categorical() ~ "{n} / {N} ({p}%)"
  )
) %>%
  modify_header(label ~ "*Variables*") %>%
  modify_spanning_header(label ~ "**LAMPS LIFESPAM**") %>%
  modify_footnote(label ~ "percentage, mean, median, sd, var")

LAMPS LIFESPAM	N = 80²
Variables¹	N = 80²
MANUFACTURER
1	40 / 80 (50%)
2	40 / 80 (50%)
LIFE-SPAM	964 (957) (110) (12,017)
¹ percentage, mean, median, sd, var
² n / N (%); Mean (Median) (SD) (Variance)

# Default gtsummary table restricted to the manufacturer column.
manufacturer_only <- select(df2, fabricante)
tbl_summary(manufacturer_only)

Characteristic	N = 80¹
fabricante
1	40 (50%)
2	40 (50%)
¹ n (%)

# Summarise vida separately for each manufacturer and append a p-value column
# comparing the groups (test chosen by add_p()'s defaults).
tbl_summary(df2, by = fabricante) %>%
  add_p()

Characteristic	1, N = 40¹	2, N = 40¹	p-value²
vida	917 (857, 971)	1,016 (950, 1,086)	<0.001
¹ Median (IQR)
² Wilcoxon rank sum test

Life-span Histogram - Manufacturer 1

# Relative-frequency histogram of manufacturer-1 vida in 100-wide bins,
# with each bar labelled by its percentage.
p1 <- ggplot(k1, aes(x = vida)) +
  # bar height = share of observations in the bin
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.8, linewidth = 0.5
  ) +
  # same binning again, drawn as text; vjust = 0.85 places the label inside
  # the bar, slightly below its top
  stat_bin(
    binwidth = 100, geom = "text", color = "white",
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = position_stack(vjust = 0.85)
  ) +
  labs(title = "Manufacturer 1", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(650, 1150, by = 100)) +
  # theme_bw() is a complete theme and replaces earlier theme() settings, so
  # it has to come before the title tweak for the centring to take effect
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p1)

Life-span Histogram - Manufacturer 2

# Relative-frequency histogram of manufacturer-2 vida in 100-wide bins,
# with each bar labelled by its percentage.
p2 <- ggplot(k2, aes(x = vida)) +
  # bar height = share of observations in the bin
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.9, linewidth = 0.5
  ) +
  # same binning again, drawn as text; vjust = 0.75 places the label inside
  # the bar, below its top
  stat_bin(
    binwidth = 100, geom = "text", color = "white",
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = position_stack(vjust = 0.75)
  ) +
  labs(title = "Manufacturer 2", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(750, 1250, by = 100)) +
  # theme_bw() is a complete theme and replaces earlier theme() settings, so
  # it has to come before the title tweak for the centring to take effect
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p2)

Percentage Polygons for Both Manufacturers

# Percentage polygons of vida for both manufacturers on one set of axes
# (manufacturer 1 in cyan, manufacturer 2 in green), with percentage labels.
# The y aesthetic uses after_stat() in place of the deprecated ..count..
# notation; it is the bin's share of observations expressed on a 0-100 scale.
p3 <- ggplot(
  k1,
  aes(x = vida, y = after_stat(count / sum(count) * 100), fill = fabricante)
) +
  geom_freqpoly(color = "cyan", binwidth = 100) +
  geom_freqpoly(data = k2, color = "#95FF00", binwidth = 100) +
  scale_x_continuous(breaks = seq(550, 1350, by = 100)) +
  labs(title = "Frequency Polygons", x = "Life-spam (hours)", y = "Percentage") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Text layers: their y is a proportion (0-1), and position_stack(vjust = 102)
  # multiplies it by 102, which appears intended to put each label just above
  # the matching polygon vertex on the 0-100 scale.
  stat_bin(
    binwidth = 100, geom = "text", color = "black", size = 2.95,
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = position_stack(vjust = 102)
  ) +
  stat_bin(
    data = k2, binwidth = 100, geom = "text", color = "black", size = 2.95,
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = position_stack(vjust = 102)
  )
ggplotly(p3)

Cumulative Percentage Polygon Manufacturer 1

# Frequency distribution of manufacturer-1 vida over left-closed 100-wide classes
m1 <- k1$vida
breaks <- seq(from = 600, to = 1100, by = 100)
# cut() assigns each value to a class [a, b); table() counts values per class
m1.cut <- cut(m1, breaks = breaks, right = FALSE)
m1.freq <- table(m1.cut)

# Cumulative percentage at each break; the leading 0 pairs with the first
# break, below which no class is counted
mfreq1 <- c(0, cumsum(m1.freq) / sum(m1.freq) * 100)

# Points first, then the red connecting line
plot(
  x = breaks, y = mfreq1,
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)",
  ylab = "Percentage"
)
lines(x = breaks, y = mfreq1, col = "red")
legend(
  x = "topleft",
  legend = "manufacturer 1",
  col = "red",
  lwd = 3,
  cex = 0.85
)

Cumulative Percentage Polygon Manufacturer 2

# Cumulative percentage polygon of manufacturer-2 vida over left-closed
# 100-wide classes.
m2 <- k2$vida
# The classes must cover every observation: with right = FALSE the last break
# has to be strictly greater than max(m2), otherwise cut() returns NA for the
# largest values and they are left out of the percentages. For the data listed
# in this document (819 to 1230) this gives breaks 800, 900, ..., 1300.
breaks <- seq(
  floor(min(m2, na.rm = TRUE) / 100) * 100,
  (floor(max(m2, na.rm = TRUE) / 100) + 1) * 100,
  by = 100
)
m2.cut <- cut(m2, breaks, right = FALSE)
m2.freq <- table(m2.cut)

# Cumulative percentage at each break; the leading 0 pairs with the first break
mfreq2 <- c(0, cumsum(m2.freq) / sum(m2.freq) * 100)
# Axis labels are left blank with empty strings, the base-graphics idiom
# (element_blank() is a ggplot2 theme element, not a base plot label)
plot(breaks, mfreq2, xlab = "", ylab = "")
lines(breaks, mfreq2, col = "blue")
legend(
  x = "topleft", legend = c("manufacturer 2"),
  col = c("blue"), lwd = 3, cex = 0.85
)

Cumulative Percentage Polygons for Both Manufacturers

# Cumulative percentage polygons of both manufacturers on one shared axis.
# Both curves are computed over the same class limits, so their x positions
# are comparable and the x and y vectors always have matching lengths.
vida_all <- c(k1$vida, k2$vida)
# 100-wide left-closed classes covering every observation; the last break is
# strictly greater than the maximum so no value falls outside the classes
breaks_both <- seq(
  floor(min(vida_all, na.rm = TRUE) / 100) * 100,
  (floor(max(vida_all, na.rm = TRUE) / 100) + 1) * 100,
  by = 100
)

# Cumulative percentage of x at each break (0 at the first break).
cum_pct <- function(x, breaks) {
  freq <- table(cut(x, breaks, right = FALSE))
  c(0, cumsum(freq) / sum(freq) * 100)
}

mfreq0 <- cum_pct(k1$vida, breaks_both) # manufacturer 1
mfreq1 <- cum_pct(k2$vida, breaks_both) # manufacturer 2 (name kept from the original code)

# Manufacturer 1 sets up the axes; manufacturer 2 is added to the same plot
# rather than overlaid with par(new = TRUE), which would redraw the axes
plot(
  breaks_both, mfreq0,
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)", ylab = "Percentage"
)
lines(breaks_both, mfreq0, col = "red")
points(breaks_both, mfreq1)
lines(breaks_both, mfreq1, col = "blue")
legend(
  x = "topleft", legend = c("manufacturer 1", "manufacturer 2"),
  col = c("red", "blue"), lwd = 3, cex = 0.85
)

R project

Arthur Martins Ferreira de Sousa