Visualizing Data, Rearranging Observations, Summary Tables, Plotting Histograms, Percentage Polygon, and Cumulative Percentage Polygon
Importing Libraries
library(knitr)
library(kableExtra)
library(tidyverse)
library(gtsummary)
library(hrbrthemes)
library (plotly)
library(ggrepel)
library(kableExtra)
Reading the Dataset
# Load the lamp data from the project's database folder and build the working
# data frames: `df` holds manufacturer code and life, `df2` is `df` sorted by
# ascending life.
setwd("C:/Users/arthu/OneDrive/Área de Trabalho/R_projects/trabalhos de VED/Databases")
getwd()
data <- readxl::read_xlsx("lampadas_trab3.xlsx")
# Keep only the two columns used below, under short ASCII names.
df <- data.frame(
  fabricante = data[["Fabricante"]],
  vida = data[["Vida Útil"]]
)
df2 <- df %>% arrange(vida)
| Manufacturer | Life-span |
|---|---|
| 1 | 684 |
| 1 | 697 |
| 1 | 720 |
| 1 | 773 |
| 2 | 819 |
| 1 | 821 |
| 1 | 831 |
| 1 | 835 |
| 2 | 836 |
| 1 | 848 |
| 1 | 852 |
| 1 | 852 |
| 1 | 859 |
| 1 | 860 |
| 1 | 868 |
| 1 | 870 |
| 1 | 876 |
| 2 | 888 |
| 1 | 893 |
| 2 | 897 |
| 1 | 899 |
| 2 | 903 |
| 1 | 905 |
| 2 | 907 |
| 1 | 909 |
| 1 | 911 |
| 2 | 912 |
| 2 | 918 |
| 1 | 922 |
| 1 | 924 |
| 1 | 926 |
| 1 | 926 |
| 1 | 938 |
| 1 | 939 |
| 2 | 942 |
| 1 | 943 |
| 2 | 943 |
| 1 | 946 |
| 2 | 952 |
| 1 | 954 |
| 2 | 959 |
| 2 | 962 |
| 1 | 971 |
| 1 | 972 |
| 1 | 977 |
| 1 | 984 |
| 2 | 986 |
| 2 | 992 |
| 2 | 994 |
| 2 | 1004 |
| 1 | 1005 |
| 2 | 1005 |
| 2 | 1007 |
| 1 | 1014 |
| 2 | 1015 |
| 1 | 1016 |
| 2 | 1016 |
| 2 | 1018 |
| 2 | 1020 |
| 2 | 1022 |
| 2 | 1034 |
| 2 | 1038 |
| 1 | 1041 |
| 1 | 1052 |
| 2 | 1072 |
| 2 | 1077 |
| 2 | 1077 |
| 1 | 1080 |
| 2 | 1082 |
| 1 | 1093 |
| 2 | 1096 |
| 2 | 1100 |
| 2 | 1113 |
| 2 | 1113 |
| 2 | 1116 |
| 2 | 1153 |
| 2 | 1154 |
| 2 | 1174 |
| 2 | 1188 |
| 2 | 1230 |
Selecting specific observations from data-frame column
# Split the sorted data by manufacturer code: k1 = manufacturer 1, k2 = manufacturer 2.
# %in% is used instead of ==: `==` yields NA for a missing manufacturer code and
# indexing rows with NA inserts an all-NA row, whereas %in% always returns
# TRUE/FALSE, so rows with a missing code are simply left out (consistent with
# the drop_na(fabricante) used for the summary table).
k1 <- df2[df2$fabricante %in% 1, ]
k2 <- df2[df2$fabricante %in% 2, ]
# k_test = df2[df2$fabricante %in% c(1,2),] select and visualize more than one observation at the same time
# View(k_test)
Cumulative Frequency table - Manufacturer 1
# Cumulative percentage table for manufacturer 1: for every distinct life value,
# the share (in %) of k1 rows with a life at or below it, attached to each k1 row.
k3 <- count(k1, vida) %>%
  mutate(Cum = cumsum(n) / sum(n) * 100) %>%
  # Only the life value and its cumulative percentage are needed for the join.
  select(vida, Cum) %>%
  # Natural join on the shared column; tied lives receive the same percentage.
  right_join(k1) %>%
  # Original k1 columns first, the new Cum column last.
  relocate(all_of(names(k1)))
| Manufacturer | Life-span | Cumulative Percentage (%) |
|---|---|---|
| 1 | 684 | 2.5 |
| 1 | 697 | 5.0 |
| 1 | 720 | 7.5 |
| 1 | 773 | 10.0 |
| 1 | 821 | 12.5 |
| 1 | 831 | 15.0 |
| 1 | 835 | 17.5 |
| 1 | 848 | 20.0 |
| 1 | 852 | 25.0 |
| 1 | 852 | 25.0 |
| 1 | 859 | 27.5 |
| 1 | 860 | 30.0 |
| 1 | 868 | 32.5 |
| 1 | 870 | 35.0 |
| 1 | 876 | 37.5 |
| 1 | 893 | 40.0 |
| 1 | 899 | 42.5 |
| 1 | 905 | 45.0 |
| 1 | 909 | 47.5 |
| 1 | 911 | 50.0 |
| 1 | 922 | 52.5 |
| 1 | 924 | 55.0 |
| 1 | 926 | 60.0 |
| 1 | 926 | 60.0 |
| 1 | 938 | 62.5 |
| 1 | 939 | 65.0 |
| 1 | 943 | 67.5 |
| 1 | 946 | 70.0 |
| 1 | 954 | 72.5 |
| 1 | 971 | 75.0 |
| 1 | 972 | 77.5 |
| 1 | 977 | 80.0 |
| 1 | 984 | 82.5 |
| 1 | 1005 | 85.0 |
| 1 | 1014 | 87.5 |
| 1 | 1016 | 90.0 |
| 1 | 1041 | 92.5 |
| 1 | 1052 | 95.0 |
| 1 | 1080 | 97.5 |
| 1 | 1093 | 100.0 |
Cumulative Frequency table - Manufacturer 2
# Cumulative percentage table for manufacturer 2: for every distinct life value,
# the share (in %) of k2 rows with a life at or below it, attached to each k2 row.
k4 <- count(k2, vida) %>%
  mutate(Cum = cumsum(n) / sum(n) * 100) %>%
  # Only the life value and its cumulative percentage are needed for the join.
  select(vida, Cum) %>%
  # Natural join on the shared column; tied lives receive the same percentage.
  right_join(k2) %>%
  # Original k2 columns first, the new Cum column last.
  relocate(all_of(names(k2)))
| Manufacturer | Life-span | Cumulative Percentage (%) |
|---|---|---|
| 2 | 819 | 2.5 |
| 2 | 836 | 5.0 |
| 2 | 888 | 7.5 |
| 2 | 897 | 10.0 |
| 2 | 903 | 12.5 |
| 2 | 907 | 15.0 |
| 2 | 912 | 17.5 |
| 2 | 918 | 20.0 |
| 2 | 942 | 22.5 |
| 2 | 943 | 25.0 |
| 2 | 952 | 27.5 |
| 2 | 959 | 30.0 |
| 2 | 962 | 32.5 |
| 2 | 986 | 35.0 |
| 2 | 992 | 37.5 |
| 2 | 994 | 40.0 |
| 2 | 1004 | 42.5 |
| 2 | 1005 | 45.0 |
| 2 | 1007 | 47.5 |
| 2 | 1015 | 50.0 |
| 2 | 1016 | 52.5 |
| 2 | 1018 | 55.0 |
| 2 | 1020 | 57.5 |
| 2 | 1022 | 60.0 |
| 2 | 1034 | 62.5 |
| 2 | 1038 | 65.0 |
| 2 | 1072 | 67.5 |
| 2 | 1077 | 72.5 |
| 2 | 1077 | 72.5 |
| 2 | 1082 | 75.0 |
| 2 | 1096 | 77.5 |
| 2 | 1100 | 80.0 |
| 2 | 1113 | 85.0 |
| 2 | 1113 | 85.0 |
| 2 | 1116 | 87.5 |
| 2 | 1153 | 90.0 |
| 2 | 1154 | 92.5 |
| 2 | 1174 | 95.0 |
| 2 | 1188 | 97.5 |
| 2 | 1230 | 100.0 |
Grouping life-span values into specified intervals - Manufacturer 1
# Frequency table of manufacturer-1 lives in 100-hour classes
# (650,750], (750,850], ..., (1050,1150] (cut() intervals are right-closed by default).
# dig.lab = 4 prints four-digit class limits in full; with the default of 3 the
# labels are rendered in scientific notation, e.g. "(950,1.05e+03]".
k5 <- table(cut(k1$vida, seq(650, 1150, 100), dig.lab = 4))
| Interval | Frequency (nº observations) |
|---|---|
| (650,750] | 3 |
| (750,850] | 5 |
| (850,950] | 20 |
| (950,1.05e+03] | 9 |
| (1.05e+03,1.15e+03] | 3 |
Grouping life-span values into specified intervals - Manufacturer 2
# Frequency table of manufacturer-2 lives in 100-hour classes
# (750,850], (850,950], ..., (1150,1250] (cut() intervals are right-closed by default).
# dig.lab = 4 prints four-digit class limits in full; with the default of 3 the
# labels are rendered in scientific notation, e.g. "(1.15e+03,1.25e+03]".
k6 <- table(cut(k2$vida, seq(750, 1250, 100), dig.lab = 4))
| Interval | Frequency (nº observations) |
|---|---|
| (750,850] | 2 |
| (850,950] | 8 |
| (950,1.05e+03] | 16 |
| (1.05e+03,1.15e+03] | 9 |
| (1.15e+03,1.25e+03] | 5 |
Summary Table
# Per-manufacturer descriptive statistics of lamp life (min, mean, standard
# deviation, variance, max and range), ordered by ascending mean.
t <- df2 %>%
  drop_na(fabricante) %>%
  group_by(fabricante) %>%
  summarise(
    Lower = min(vida),
    Average = mean(vida),
    SD = sd(vida),
    VAR = var(vida),
    Upper = max(vida),
    # Range of the group: largest minus smallest life.
    Difference = Upper - Lower
  ) %>%
  arrange(Average)

# Render the statistics as a centred, styled table with an enlarged header row
# (row 0 is the header).
kable(x = t, caption = "Interval", align = "c") %>%
  row_spec(row = 0, font_size = 15) %>%
  kable_styling()
| fabricante | Lower | Average | SD | VAR | Upper | Difference |
|---|---|---|---|---|---|---|
| 1 | 684 | 909.65 | 94.30516 | 8893.464 | 1093 | 409 |
| 2 | 819 | 1018.35 | 96.90136 | 9389.874 | 1230 | 411 |
Gtsummary for Summary Tables
# Overall gtsummary table for both columns of df2 with custom labels:
# continuous variables show mean, median, sd and variance; categorical ones
# show n / N and the percentage. Header, spanning header and footnote of the
# label column are then overridden.
tbl_summary(
  df2,
  label = list(
    vida ~ "LIFE-SPAM",
    fabricante ~ "MANUFACTURER"
  ),
  statistic = list(
    all_continuous() ~ "{mean} ({median}) ({sd}) ({var})",
    all_categorical() ~ "{n} / {N} ({p}%)"
  )
) %>%
  modify_header(label ~ "*Variables*") %>%
  modify_spanning_header(label ~ "**LAMPS LIFESPAM**") %>%
  modify_footnote(label ~ "percentage, mean, median, sd, var")
| LAMPS LIFESPAM | N = 80<sup>2</sup> |
|---|---|
| Variables<sup>1</sup> | |
| MANUFACTURER | |
| 1 | 40 / 80 (50%) |
| 2 | 40 / 80 (50%) |
| LIFE-SPAM | 964 (957) (110) (12,017) |
| <sup>1</sup> percentage, mean, median, sd, var | |
| <sup>2</sup> n / N (%); Mean (Median) (SD) (Variance) | |
# Default gtsummary table restricted to the manufacturer column.
tbl_summary(select(df2, fabricante))
| Characteristic | N = 80<sup>1</sup> |
|---|---|
| fabricante | |
| 1 | 40 (50%) |
| 2 | 40 (50%) |
| <sup>1</sup> n (%) | |
# Lamp life summarised separately for each manufacturer, with a p-value column
# comparing the two groups appended by add_p().
tbl_summary(df2, by = fabricante) %>%
  add_p()
| Characteristic | 1, N = 40<sup>1</sup> | 2, N = 40<sup>1</sup> | p-value<sup>2</sup> |
|---|---|---|---|
| vida | 917 (857, 971) | 1,016 (950, 1,086) | <0.001 |
| <sup>1</sup> Median (IQR) | |||
| <sup>2</sup> Wilcoxon rank sum test | |||
Histogram of Life-span - Manufacturer 1
# Relative-frequency histogram of lamp life for manufacturer 1 (100-hour bins),
# with each bar's share printed inside the bar, rendered interactively.
p1 <- ggplot(k1, aes(x = vida)) +
  # Bar height = share of observations falling in the bin.
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.8, linewidth = 0.5
  ) +
  # Percentage label placed at 85 % of each bar's height.
  stat_bin(
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "white",
    position = position_stack(vjust = 0.85)
  ) +
  labs(title = "Manufacturer 1", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(650, 1150, by = 100)) +
  # theme_bw() is a complete theme and replaces every theme setting made before
  # it, so the title centring has to be added AFTER it to take effect.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p1)
Histogram of Life-span - Manufacturer 2
# Relative-frequency histogram of lamp life for manufacturer 2 (100-hour bins),
# with each bar's share printed inside the bar, rendered interactively.
p2 <- ggplot(k2, aes(x = vida)) +
  # Bar height = share of observations falling in the bin.
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.9, linewidth = 0.5
  ) +
  # Percentage label placed at 75 % of each bar's height.
  stat_bin(
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "white",
    position = position_stack(vjust = 0.75)
  ) +
  labs(title = "Manufacturer 2", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(750, 1250, by = 100)) +
  # theme_bw() is a complete theme and replaces every theme setting made before
  # it, so the title centring has to be added AFTER it to take effect.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p2)
Percentage Polygon from both Manufacturers
# Percentage polygons of lamp life for both manufacturers on one chart
# (100-hour bins): manufacturer 1 in cyan, manufacturer 2 in green, with the
# percentage of each bin printed just above its vertex.
#
# The y aesthetic uses after_stat() throughout instead of the deprecated
# ..count.. notation, and the unused `fill` mapping is gone (lines and text
# have no fill).
p3 <- ggplot(k1, aes(x = vida, y = after_stat(count / sum(count) * 100))) +
  geom_freqpoly(color = "cyan", binwidth = 100) +
  geom_freqpoly(data = k2, color = "#95FF00", binwidth = 100) +
  scale_x_continuous(breaks = seq(550, 1350, by = 100)) +
  labs(title = "Frequency Polygons", x = "Life-spam (hours)", y = "Percentage") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Labels sit 2 % above the polygon vertex. They are computed on the same
  # 0-100 scale as the lines, so no position_stack(vjust = ...) rescaling
  # trick is needed to line them up.
  stat_bin(
    aes(
      y = after_stat(count / sum(count) * 100 * 1.02),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "black", size = 2.95,
    position = "identity"
  ) +
  stat_bin(
    data = k2,
    aes(
      y = after_stat(count / sum(count) * 100 * 1.02),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "black", size = 2.95,
    position = "identity"
  )
ggplotly(p3)
Cumulative Percentage Polygon Manufacturer 1
# Cumulative percentage polygon for manufacturer 1.
# Frequency distribution of the life-span values over 100-hour classes
# [600,700), ..., [1000,1100) (right = FALSE closes each class on the left).
m1 <- k1[["vida"]]
breaks <- seq(from = 600, to = 1100, by = 100)
m1.cut <- cut(m1, breaks = breaks, right = FALSE)
m1.freq <- table(m1.cut)
# The leading 0 anchors the polygon at the first class limit, giving exactly
# one cumulative value per break.
mfreq1 <- c(0, cumsum(m1.freq) / sum(m1.freq) * 100)
plot(
  x = breaks, y = mfreq1,
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)", ylab = "Percentage"
)
lines(x = breaks, y = mfreq1, col = "red")
legend("topleft", legend = "manufacturer 1", col = "red", lwd = 3, cex = 0.85)
Cumulative Percentage Polygon Manufacturer 2
# Cumulative percentage polygon for manufacturer 2 over 100-hour classes
# [800,900), ..., [1200,1300) (right = FALSE closes each class on the left).
m2 <- k2$vida
# The class limits must reach past the largest observation (1230 h in the
# sorted table): cut() returns NA for values outside the breaks and table()
# drops NAs silently, so stopping at 1200 would compute the percentages over
# 39 lamps instead of 40.
breaks <- seq(800, 1300, by = 100)
m2.cut <- cut(m2, breaks, right = FALSE)
m2.freq <- table(m2.cut)
# The leading 0 anchors the polygon at the first class limit, giving exactly
# one cumulative value per break.
mfreq2 <- c(0, cumsum(m2.freq) / sum(m2.freq) * 100)
# Axis titles are intentionally blank; "" is the base-graphics way to say so
# (element_blank() is a ggplot2 theme element, not a base plot label).
plot(breaks, mfreq2, xlab = "", ylab = "")
lines(breaks, mfreq2, col = "blue")
legend("topleft", legend = "manufacturer 2", col = "blue", lwd = 3, cex = 0.85)
Both Cumulative Frequency plots
# Cumulative percentage polygons of both manufacturers on one chart.
#
# Both curves are computed on ONE shared set of 100-hour class limits and drawn
# in the same coordinate system. Reusing `breaks` from the manufacturer-2 chunk
# gave an x vector shorter than the manufacturer-1 percentages (plot() stops
# with "'x' and 'y' lengths differ"), and overlaying a second plot() via
# par(new = TRUE) would give each curve its own axis scale, so the curves could
# not be compared.
all_lives <- c(k1$vida, k2$vida)
# Lower limit: the hundred at or below the smallest life. Upper limit: the next
# hundred strictly above the largest life, because classes are closed on the
# left (right = FALSE) and the maximum must still fall inside the last class.
common_breaks <- seq(
  floor(min(all_lives, na.rm = TRUE) / 100) * 100,
  (floor(max(all_lives, na.rm = TRUE) / 100) + 1) * 100,
  by = 100
)
freq_m1 <- table(cut(k1$vida, common_breaks, right = FALSE))
freq_m2 <- table(cut(k2$vida, common_breaks, right = FALSE))
# The leading 0 anchors each polygon at the first class limit.
# mfreq0 holds manufacturer 1; mfreq1 holds manufacturer 2 in this chunk.
mfreq0 <- c(0, cumsum(freq_m1) / sum(freq_m1) * 100)
mfreq1 <- c(0, cumsum(freq_m2) / sum(freq_m2) * 100)
plot(
  common_breaks, mfreq0,
  ylim = c(0, 100),
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)", ylab = "Percentage"
)
lines(common_breaks, mfreq0, col = "red")
points(common_breaks, mfreq1)
lines(common_breaks, mfreq1, col = "blue")
legend(
  "topleft",
  legend = c("manufacturer 1", "manufacturer 2"),
  col = c("red", "blue"), lwd = 3, cex = 0.85
)