R project

Arthur Martins Ferreira de Sousa

Visualizing Data, Rearranging Observations, Summary Tables, Plotting Histograms, Percentage Polygon, and Cumulative Percentage Polygon

Importing Libraries

library(knitr)
library(kableExtra)
library(tidyverse)
library(gtsummary)
library(hrbrthemes)
library (plotly)
library(ggrepel)
library(kableExtra)

Reading Data-set

# Point the session at the folder that holds the course databases and echo
# the resulting working directory.
setwd("C:/Users/arthu/OneDrive/Área de Trabalho/R_projects/trabalhos de VED/Databases")
getwd()

# Load the lamp workbook, keep the manufacturer and life-span columns under
# shorter names, then sort the observations by life-span (ascending).
data <- readxl::read_xlsx("lampadas_trab3.xlsx")
df <- data.frame(
  fabricante = data$Fabricante,
  vida = data$`Vida Útil`
)
df2 <- df %>% arrange(vida)
Data-set
Manufacturer Life-span
1 684
1 697
1 720
1 773
2 819
1 821
1 831
1 835
2 836
1 848
1 852
1 852
1 859
1 860
1 868
1 870
1 876
2 888
1 893
2 897
1 899
2 903
1 905
2 907
1 909
1 911
2 912
2 918
1 922
1 924
1 926
1 926
1 938
1 939
2 942
1 943
2 943
1 946
2 952
1 954
2 959
2 962
1 971
1 972
1 977
1 984
2 986
2 992
2 994
2 1004
1 1005
2 1005
2 1007
1 1014
2 1015
1 1016
2 1016
2 1018
2 1020
2 1022
2 1034
2 1038
1 1041
1 1052
2 1072
2 1077
2 1077
1 1080
2 1082
1 1093
2 1096
2 1100
2 1113
2 1113
2 1116
2 1153
2 1154
2 1174
2 1188
2 1230

Selecting specific observations from data-frame column

# Split the sorted observations by manufacturer.
# which() turns the logical test into row positions, so rows whose
# `fabricante` is NA are left out; indexing directly with a logical vector
# that contains NA would return an all-NA row for each of them.
k1 <- df2[which(df2$fabricante == 1), ]

k2 <- df2[which(df2$fabricante == 2), ]

# To pull several manufacturers at once, match against a set with %in%
# (it never yields NA):
# k_test <- df2[df2$fabricante %in% c(1, 2), ]
# View(k_test)

Cumulative Frequency table - Manufacturer 1

# Manufacturer 1 observations with the cumulative percentage (0-100) reached
# at each life-span value; tied values share the same cumulative figure.
k3 <- local({
  # One row per distinct life-span with its cumulative share of observations.
  cum_tbl <- count(k1, vida)
  cum_tbl <- mutate(cum_tbl, Cum = cumsum(n) / sum(n) * 100)
  cum_tbl <- select(cum_tbl, -n)

  # Attach the cumulative share back onto every original observation and put
  # the original columns first.
  joined <- right_join(cum_tbl, k1)
  relocate(joined, all_of(names(k1)))
})
Data-set + Cumulative Freq
Manufacturer Life-span Cumulative Percentage (%)
1 684 2.5
1 697 5.0
1 720 7.5
1 773 10.0
1 821 12.5
1 831 15.0
1 835 17.5
1 848 20.0
1 852 25.0
1 852 25.0
1 859 27.5
1 860 30.0
1 868 32.5
1 870 35.0
1 876 37.5
1 893 40.0
1 899 42.5
1 905 45.0
1 909 47.5
1 911 50.0
1 922 52.5
1 924 55.0
1 926 60.0
1 926 60.0
1 938 62.5
1 939 65.0
1 943 67.5
1 946 70.0
1 954 72.5
1 971 75.0
1 972 77.5
1 977 80.0
1 984 82.5
1 1005 85.0
1 1014 87.5
1 1016 90.0
1 1041 92.5
1 1052 95.0
1 1080 97.5
1 1093 100.0

Cumulative Frequency table - Manufacturer 2

# Manufacturer 2 observations with the cumulative percentage (0-100) reached
# at each life-span value; tied values share the same cumulative figure.
k4 <- local({
  # One row per distinct life-span with its cumulative share of observations.
  cum_tbl <- count(k2, vida)
  cum_tbl <- mutate(cum_tbl, Cum = cumsum(n) / sum(n) * 100)
  cum_tbl <- select(cum_tbl, -n)

  # Attach the cumulative share back onto every original observation and put
  # the original columns first.
  joined <- right_join(cum_tbl, k2)
  relocate(joined, all_of(names(k2)))
})
Data-set + Cumulative Freq
Manufacturer Life-span Cumulative Percentage (%)
2 819 2.5
2 836 5.0
2 888 7.5
2 897 10.0
2 903 12.5
2 907 15.0
2 912 17.5
2 918 20.0
2 942 22.5
2 943 25.0
2 952 27.5
2 959 30.0
2 962 32.5
2 986 35.0
2 992 37.5
2 994 40.0
2 1004 42.5
2 1005 45.0
2 1007 47.5
2 1015 50.0
2 1016 52.5
2 1018 55.0
2 1020 57.5
2 1022 60.0
2 1034 62.5
2 1038 65.0
2 1072 67.5
2 1077 72.5
2 1077 72.5
2 1082 75.0
2 1096 77.5
2 1100 80.0
2 1113 85.0
2 1113 85.0
2 1116 87.5
2 1153 90.0
2 1154 92.5
2 1174 95.0
2 1188 97.5
2 1230 100.0

Grouping Life-span ranges by specified intervals - Manufacturer 1

# Frequency of manufacturer 1 life-spans in 100-hour classes from 650 to 1150
# (cut() default: each class is open on the left and closed on the right).
k5 <- table(cut(k1$vida, breaks = seq(from = 650, to = 1150, by = 100)))
Interval
Interval Frequency (nº observations)
(650,750] 3
(750,850] 5
(850,950] 20
(950,1.05e+03] 9
(1.05e+03,1.15e+03] 3

Grouping Life-span ranges by specified intervals - Manufacturer 2

# Frequency of manufacturer 2 life-spans in 100-hour classes from 750 to 1250
# (cut() default: each class is open on the left and closed on the right).
k6 <- table(cut(k2$vida, breaks = seq(from = 750, to = 1250, by = 100)))
Interval
Interval Frequency (nº observations)
(750,850] 2
(850,950] 8
(950,1.05e+03] 16
(1.05e+03,1.15e+03] 9
(1.15e+03,1.25e+03] 5

Summary Table

# Per-manufacturer descriptive statistics of life-span (rows without a
# manufacturer are dropped), ordered by increasing mean.
t <- df2 %>%
  drop_na(fabricante) %>%
  group_by(fabricante) %>%
  summarise(
    Lower = min(vida),
    Average = mean(vida),
    SD = sd(vida),
    VAR = var(vida),
    Upper = max(vida),
    # Range of the group: reuses the two extremes computed just above.
    Difference = Upper - Lower
  ) %>%
  arrange(Average)

# Render the summary as a centred table with an enlarged header row (row 0).
kable_styling(
  row_spec(
    kable(t, caption = "Interval", align = "c"),
    0,
    font_size = 15
  )
)
Interval
fabricante Lower Average SD VAR Upper Difference
1 684 909.65 94.30516 8893.464 1093 409
2 819 1018.35 96.90136 9389.874 1230 411

Gtsummary for Summary Tables

# Overall summary of both columns with custom row labels:
# mean (median) (sd) (var) for continuous variables and n / N (%) for
# categorical ones, plus a custom header, spanning header and footnote.
df2 %>%
  tbl_summary(
    label = list(
      vida ~ "LIFE-SPAM",
      fabricante ~ "MANUFACTURER"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({median}) ({sd}) ({var})",
      all_categorical() ~ "{n} / {N} ({p}%)"
    )
  ) %>%
  modify_header(label ~ "*Variables*") %>%
  modify_spanning_header(label ~ "**LAMPS LIFESPAM**") %>%
  modify_footnote(label ~ "percentage, mean, median, sd, var")
LAMPS LIFESPAM N = 80²
Variables¹
MANUFACTURER
    1 40 / 80 (50%)
    2 40 / 80 (50%)
LIFE-SPAM 964 (957) (110) (12,017)
¹ percentage, mean, median, sd, var
² n / N (%); Mean (Median) (SD) (Variance)
# Default gtsummary table restricted to the manufacturer column.
tbl_summary(select(df2, fabricante))
Characteristic N = 80¹
fabricante
    1 40 (50%)
    2 40 (50%)
¹ n (%)
# Life-span summarised separately for each manufacturer, with a p-value
# column comparing the two groups.
add_p(tbl_summary(df2, by = fabricante))
Characteristic 1, N = 40¹ 2, N = 40¹ p-value²
vida 917 (857, 971) 1,016 (950, 1,086) <0.001
¹ Median (IQR)
² Wilcoxon rank sum test

Histogram Life-span Manufacturer 1

# Relative-frequency histogram of manufacturer 1 life-spans in 100-hour bins;
# each bar is labelled with its share of the observations.
p1 <- ggplot(k1, aes(x = vida)) +
  geom_histogram(
    # Bar height = bin count divided by the total count (a proportion).
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.8, linewidth = 0.5
  ) +
  stat_bin(
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "white",
    # vjust < 1 places the label inside the bar, just below its top.
    position = position_stack(vjust = 0.85)
  ) +
  labs(title = "Manufacturer 1", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(650, 1150, by = 100)) +
  # theme_bw() is a complete theme: added last it would replace every theme
  # setting made before it, so the title tweak has to come after it.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p1)

Histogram Life-span Manufacturer 2

# Relative-frequency histogram of manufacturer 2 life-spans in 100-hour bins;
# each bar is labelled with its share of the observations.
p2 <- ggplot(k2, aes(x = vida)) +
  geom_histogram(
    # Bar height = bin count divided by the total count (a proportion).
    aes(y = after_stat(count / sum(count))),
    binwidth = 100,
    fill = "#69b3a2", color = "grey", alpha = 0.9, linewidth = 0.5
  ) +
  stat_bin(
    aes(
      y = after_stat(count / sum(count)),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    binwidth = 100, geom = "text", color = "white",
    # vjust < 1 places the label inside the bar, below its top.
    position = position_stack(vjust = 0.75)
  ) +
  labs(title = "Manufacturer 2", x = "Life-spam (hours)", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(750, 1250, by = 100)) +
  # theme_bw() is a complete theme: added last it would replace every theme
  # setting made before it, so the title tweak has to come after it.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p2)

Percentage Polygon from both Manufacturers

# Percentage polygons of both manufacturers on shared axes (100-hour bins):
# manufacturer 1 in cyan, manufacturer 2 in green, every vertex labelled with
# its percentage.
#
# The y aesthetic uses after_stat() throughout instead of the deprecated
# `..count..` notation, and the `fill` mapping is dropped because neither the
# polygon lines nor the text labels have a fill.
p3 <- ggplot(k1, aes(x = vida, y = after_stat(count / sum(count) * 100))) +
  geom_freqpoly(color = "cyan", binwidth = 100) +
  geom_freqpoly(data = k2, color = "#95FF00", binwidth = 100) +
  scale_x_continuous(breaks = seq(550, 1350, by = 100)) +
  labs(title = "Frequency Polygons", x = "Life-spam (hours)", y = "Percentage") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  stat_bin(
    binwidth = 100, geom = "text", color = "black", size = 2.95,
    aes(
      # Percentage scale of the polygon, lifted 2% so the label sits just
      # above its vertex.
      y = after_stat(count / sum(count) * 100 * 1.02),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = "identity"
  ) +
  stat_bin(
    data = k2, binwidth = 100, geom = "text", color = "black", size = 2.95,
    aes(
      y = after_stat(count / sum(count) * 100 * 1.02),
      label = scales::percent(after_stat(count / sum(count)))
    ),
    position = "identity"
  )
ggplotly(p3)

Cumulative Percentage Polygon Manufacturer 1

# Frequency distribution of manufacturer 1 life-spans over 100-hour classes.
m1 <- k1$vida
breaks <- seq(from = 600, to = 1100, by = 100)
# cut() assigns each value to a class; right = FALSE makes every class
# closed on the left and open on the right: [lower, upper).
m1.cut <- cut(m1, breaks = breaks, right = FALSE)
m1.freq <- table(m1.cut)

# Cumulative percentage at each class limit, starting from 0 at the first one.
mfreq1 <- c(0, cumsum(m1.freq) / sum(m1.freq) * 100)

# Points first, then the red connecting line and its legend.
plot(
  x = breaks, y = mfreq1,
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)",
  ylab = "Percentage"
)
lines(x = breaks, y = mfreq1, col = "red")
legend(
  x = "topleft",
  legend = "manufacturer 1",
  col = "red", lwd = 3, cex = 0.85
)

Cumulative Percentage Polygon Manufacturer 2

# Frequency distribution of manufacturer 2 life-spans over 100-hour classes.
m2 <- k2$vida
# With right = FALSE the classes are [lower, upper), so the last limit must be
# strictly greater than the largest observation; otherwise cut() returns NA
# for it and it silently drops out of the percentages (a fixed upper limit of
# 1200 lost the 1230-hour lamp). The limits are therefore derived from the
# data, rounded outwards to multiples of 100.
breaks <- seq(
  from = floor(min(m2, na.rm = TRUE) / 100) * 100,
  to = (floor(max(m2, na.rm = TRUE) / 100) + 1) * 100,
  by = 100
)
m2.cut <- cut(m2, breaks, right = FALSE)
m2.freq <- table(m2.cut)

# Cumulative percentage at each class limit, starting from 0 at the first one.
mfreq2 <- c(0, cumsum(m2.freq) / sum(m2.freq) * 100)
# Axis titles are intentionally blank; base graphics expects a string here,
# not a ggplot2 theme element such as element_blank().
plot(breaks, mfreq2, xlab = "", ylab = "")
lines(breaks, mfreq2, col = "blue")
legend(
  x = "topleft", legend = c("manufacturer 2"),
  col = c("blue"), lwd = 3, cex = 0.85
)

Both Cumulative Frequency plots

# Both cumulative percentage polygons on a single set of axes.
#
# The two curves are computed over the same class limits, rounded outwards to
# multiples of 100 so that every observation of either manufacturer falls in
# a class. Sharing the limits keeps x and y the same length for both curves
# and puts them on one common scale (overlaying two independent plots with
# par(new = TRUE) would draw each curve against its own axes).
vida_both <- c(k1$vida, k2$vida)
breaks_both <- seq(
  from = floor(min(vida_both, na.rm = TRUE) / 100) * 100,
  to = (floor(max(vida_both, na.rm = TRUE) / 100) + 1) * 100,
  by = 100
)
freq_m1 <- table(cut(k1$vida, breaks_both, right = FALSE))
freq_m2 <- table(cut(k2$vida, breaks_both, right = FALSE))

# mfreq0 = manufacturer 1, mfreq1 = manufacturer 2 (this reassigns mfreq1).
mfreq0 <- c(0, cumsum(freq_m1) / sum(freq_m1) * 100)
mfreq1 <- c(0, cumsum(freq_m2) / sum(freq_m2) * 100)

plot(
  breaks_both, mfreq0,
  ylim = c(0, 100),
  main = "Cumulative Percentage Polygon",
  xlab = "Life-spam (hours)", ylab = "Percentage"
)
lines(breaks_both, mfreq0, col = "red")
# Add the second manufacturer to the existing plot instead of starting a new one.
points(breaks_both, mfreq1)
lines(breaks_both, mfreq1, col = "blue")
legend(
  x = "topleft", legend = c("manufacturer 1", "manufacturer 2"),
  col = c("red", "blue"), lwd = 3, cex = 0.85
)