library("pacman")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("dplyr")
library("ggplot2")
# Location of the raw ship-accidents CSV file.
theUrl <- "https://raw.githubusercontent.com/enidroman/R-Programing/main/ShipAccidents.csv"
# Parse the comma-separated file; its first row supplies the column names.
ship_accidents <- read.table(
  file = theUrl,
  header = TRUE,
  sep = ","
)
# Preview the first rows of the data.
head(ship_accidents)
## X type construction operation service incidents
## 1 1 A 1960-64 1960-74 127 0
## 2 2 A 1960-64 1975-79 63 0
## 3 3 A 1965-69 1960-74 1095 3
## 4 4 A 1965-69 1975-79 1095 4
## 5 5 A 1970-74 1960-74 1512 6
## 6 6 A 1970-74 1975-79 3353 18
# Structure overview: column names and types, with `vec.len = 1` limiting how
# many leading values are displayed for each column.
str(object = ship_accidents, vec.len = 1)
## 'data.frame': 40 obs. of 6 variables:
## $ X : int 1 2 ...
## $ type : chr "A" ...
## $ construction: chr "1960-64" ...
## $ operation : chr "1960-74" ...
## $ service : int 127 63 ...
## $ incidents : int 0 0 ...
# Column-wise summary statistics for the whole data frame.
summary(object = ship_accidents)
## X type construction operation
## Min. : 1.00 Length:40 Length:40 Length:40
## 1st Qu.:10.75 Class :character Class :character Class :character
## Median :20.50 Mode :character Mode :character Mode :character
## Mean :20.50
## 3rd Qu.:30.25
## Max. :40.00
## service incidents
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 175.8 1st Qu.: 0.0
## Median : 782.0 Median : 2.0
## Mean : 4089.3 Mean : 8.9
## 3rd Qu.: 2078.5 3rd Qu.:11.0
## Max. :44882.0 Max. :58.0
# Mean, maximum and minimum of service and incidents for each ship type.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
ship_accidents %>%
  group_by(type) %>%
  summarise(
    mean_service = mean(service, na.rm = TRUE),
    mean_incidents = mean(incidents, na.rm = TRUE),
    max_service = max(service, na.rm = TRUE),
    max_incidents = max(incidents, na.rm = TRUE),
    min_service = min(service, na.rm = TRUE),
    min_incidents = min(incidents, na.rm = TRUE)
  )
## # A tibble: 5 × 7
## type mean_service mean_incidents max_service max_incidents min_serv…¹ min_i…²
## <chr> <dbl> <dbl> <int> <int> <int> <int>
## 1 A 1186. 5.25 3353 18 0 0
## 2 B 17290. 31.6 44882 58 0 0
## 3 C 774. 1.5 1948 6 0 0
## 4 D 556. 2.12 2051 11 0 0
## 5 E 641. 4 2161 12 0 0
## # … with abbreviated variable names ¹min_service, ²min_incidents
# Six-number summary of service, computed separately for each ship type.
with(ship_accidents, tapply(service, type, summary))
## $A
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 111 1095 1186 1695 3353
##
## $B
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 7104 15138 17290 22430 44882
##
## $C
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 482.5 728.5 774.1 882.0 1948.0
##
## $D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 170.2 269.5 555.5 563.8 2051.0
##
## $E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 33.75 489.50 641.38 881.00 2161.00
# Six-number summary of incidents, computed separately for each ship type.
with(ship_accidents, tapply(incidents, type, summary))
## $A
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 3.50 5.25 7.25 18.00
##
## $B
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 16.50 34.00 31.62 46.25 58.00
##
## $C
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.75 1.00 1.50 1.25 6.00
##
## $D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.125 2.500 11.000
##
## $E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 3 4 7 12
# Overall mean, maximum and minimum of service across all rows.
mean(ship_accidents[["service"]])
## [1] 4089.35
max(ship_accidents[["service"]])
## [1] 44882
min(ship_accidents[["service"]])
## [1] 0
# Overall mean, maximum and minimum of incidents across all rows.
mean(ship_accidents[["incidents"]])
## [1] 8.9
max(ship_accidents[["incidents"]])
## [1] 58
min(ship_accidents[["incidents"]])
## [1] 0
# Mean & standard deviation by ship type for service.
# Plain summarise() replaces the superseded summarise_at(); the result keeps
# the same `mean` and `sd` columns.
ship_accidents_sd <- ship_accidents %>%
  group_by(type) %>%
  summarise(mean = mean(service), sd = sd(service)) %>%
  as.data.frame()
ship_accidents_sd
## type mean sd
## 1 A 1186.125 1178.8188
## 2 B 17289.625 14260.3707
## 3 C 774.125 591.6557
## 4 D 555.500 708.4044
## 5 E 641.375 739.3968
# One point per ship type at the mean, with error bars spanning mean +/- sd.
ggplot(ship_accidents_sd, aes(x = type, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd)) +
  geom_point()
# Mean & standard deviation by ship type for incidents.
# Plain summarise() replaces the superseded summarise_at(); the result keeps
# the same `mean` and `sd` columns.
ship_accidents_sd <- ship_accidents %>%
  group_by(type) %>%
  summarise(mean = mean(incidents), sd = sd(incidents)) %>%
  as.data.frame()
ship_accidents_sd
## type mean sd
## 1 A 5.250 6.386369
## 2 B 31.625 20.486494
## 3 C 1.500 1.927248
## 4 D 2.125 3.870677
## 5 E 4.000 4.472136
# One point per ship type at the mean, with error bars spanning mean +/- sd.
ggplot(ship_accidents_sd, aes(x = type, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd)) +
  geom_point()
# Total of service and incidents for each ship type.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
ship_accidents_newgroup <- ship_accidents
ship_accidents_newgroup %>%
  group_by(type) %>%
  summarise(
    sum_service = sum(service, na.rm = TRUE),
    sum_incidents = sum(incidents, na.rm = TRUE)
  )
## # A tibble: 5 × 3
## type sum_service sum_incidents
## <chr> <int> <int>
## 1 A 9489 42
## 2 B 138317 253
## 3 C 6193 12
## 4 D 4444 17
## 5 E 5131 32
# Graph with ggplot2: box plot of service by ship type.
ggplot(ship_accidents_newgroup, aes(x = type, y = service)) +
  geom_boxplot()
# Graph with ggplot2: box plot of incidents by ship type. Again, Ship Type B
# stands out.
ggplot(ship_accidents_newgroup, aes(x = type, y = incidents)) +
  geom_boxplot()
# Here I began to do some cleaning of the dataset.
# I checked the columns.
colnames(ship_accidents)
## [1] "X" "type" "construction" "operation" "service"
## [6] "incidents"
# I changed the names of the columns and capitalized all letters.
colnames(ship_accidents)[1:6] <- c(
  "ID", "SHIP_TYPE", "CONSTRUCTION_YEARS",
  "OPERATION_YEARS", "TOTAL_SERVICE", "TOTAL_INCIDENTS"
)
head(ship_accidents, 10)
## ID SHIP_TYPE CONSTRUCTION_YEARS OPERATION_YEARS TOTAL_SERVICE
## 1 1 A 1960-64 1960-74 127
## 2 2 A 1960-64 1975-79 63
## 3 3 A 1965-69 1960-74 1095
## 4 4 A 1965-69 1975-79 1095
## 5 5 A 1970-74 1960-74 1512
## 6 6 A 1970-74 1975-79 3353
## 7 7 A 1975-79 1960-74 0
## 8 8 A 1975-79 1975-79 2244
## 9 9 B 1960-64 1960-74 44882
## 10 10 B 1960-64 1975-79 17176
## TOTAL_INCIDENTS
## 1 0
## 2 0
## 3 3
## 4 4
## 5 6
## 6 18
## 7 0
## 8 11
## 9 39
## 10 29
# Show every row whose total service is zero, selected by condition rather
# than by hard-coded position (rows 7, 15, 23, 31, 34 and 39 in this data).
# which() keeps only definite matches, so a missing value never selects a row.
ship_accidents[which(ship_accidents$TOTAL_SERVICE == 0), ]
## ID SHIP_TYPE CONSTRUCTION_YEARS OPERATION_YEARS TOTAL_SERVICE
## 7 7 A 1975-79 1960-74 0
## 15 15 B 1975-79 1960-74 0
## 23 23 C 1975-79 1960-74 0
## 31 31 D 1975-79 1960-74 0
## 34 34 E 1960-64 1975-79 0
## 39 39 E 1975-79 1960-74 0
## TOTAL_INCIDENTS
## 7 0
## 15 0
## 23 0
## 31 0
## 34 0
## 39 0
# Build the cleaned data set: drop the rows with zero total service (rows 7,
# 15, 23, 31, 34 and 39) and the two year-range columns.
# The earlier index vector listed 3 where 23 was meant, which removed a valid
# type A row and left the zero-service type C row in place; filtering on the
# value itself avoids that class of typo.
# NOTE(review): the printed results that follow were produced before this fix
# and need to be regenerated.
ship_accidents_new <- ship_accidents[
  which(ship_accidents$TOTAL_SERVICE != 0),
  c("ID", "SHIP_TYPE", "TOTAL_SERVICE", "TOTAL_INCIDENTS")
]
head(ship_accidents_new, 10)
## ID SHIP_TYPE TOTAL_SERVICE TOTAL_INCIDENTS
## 1 1 A 127 0
## 2 2 A 63 0
## 4 4 A 1095 4
## 5 5 A 1512 6
## 6 6 A 3353 18
## 8 8 A 2244 11
## 9 9 B 44882 39
## 10 10 B 17176 29
## 11 11 B 28609 58
## 12 12 B 20370 53
# Overall mean, maximum and minimum of total service in the cleaned data.
mean(ship_accidents_new[["TOTAL_SERVICE"]])
## [1] 4778.794
max(ship_accidents_new[["TOTAL_SERVICE"]])
## [1] 44882
min(ship_accidents_new[["TOTAL_SERVICE"]])
## [1] 0
# Overall mean, maximum and minimum of total incidents in the cleaned data.
mean(ship_accidents_new[["TOTAL_INCIDENTS"]])
## [1] 10.38235
max(ship_accidents_new[["TOTAL_INCIDENTS"]])
## [1] 58
min(ship_accidents_new[["TOTAL_INCIDENTS"]])
## [1] 0
# Mean & standard deviation of total service by ship type, cleaned data.
# Plain summarise() replaces the superseded summarise_at(); the result keeps
# the same `MEAN` and `SD` columns.
ship_accidents_sd_new <- ship_accidents_new %>%
  group_by(SHIP_TYPE) %>%
  summarise(MEAN = mean(TOTAL_SERVICE), SD = sd(TOTAL_SERVICE)) %>%
  as.data.frame()
ship_accidents_sd_new
## SHIP_TYPE MEAN SD
## 1 A 1399.0000 1268.0856
## 2 B 19759.5714 13428.0281
## 3 C 774.1250 591.6557
## 4 D 634.8571 725.7404
## 5 E 855.1667 738.9166
# One point per ship type at the mean, with error bars spanning MEAN +/- SD.
ggplot(ship_accidents_sd_new, aes(x = SHIP_TYPE, y = MEAN)) +
  geom_errorbar(aes(ymin = MEAN - SD, ymax = MEAN + SD)) +
  geom_point()
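# Change in mean and SD of incidents by ship type after cleaning
# (value after cleaning minus value before cleaning):
# SHIP_TYPE | MEAN     | SD
# A         | 1.25     | 0.59217
# B         | 4.517857 | -3.190747
# C         | 0        | 0
# D         | 0.303571 | 0.20597
# E         | 1.333333 | -0.060031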
# Mean & standard deviation of total incidents by ship type, cleaned data.
# Plain summarise() replaces the superseded summarise_at(); the result keeps
# the same `MEAN` and `SD` columns.
ship_accidents_sd_new <- ship_accidents_new %>%
  group_by(SHIP_TYPE) %>%
  summarise(MEAN = mean(TOTAL_INCIDENTS), SD = sd(TOTAL_INCIDENTS)) %>%
  as.data.frame()
ship_accidents_sd_new
## SHIP_TYPE MEAN SD
## 1 A 6.500000 6.978539
## 2 B 36.142857 17.295747
## 3 C 1.500000 1.927248
## 4 D 2.428571 4.076647
## 5 E 5.333333 4.412105
# One point per ship type at the mean, with error bars spanning MEAN +/- SD.
ggplot(ship_accidents_sd_new, aes(x = SHIP_TYPE, y = MEAN)) +
  geom_errorbar(aes(ymin = MEAN - SD, ymax = MEAN + SD)) +
  geom_point()
# Reset the row names to 1..n so they are consecutive again after rows were
# dropped. The ID column is left alone and still holds the original row
# identifiers. seq_len() is used instead of 1:nrow() because 1:0 would yield
# c(1, 0) for an empty data frame.
rownames(ship_accidents_new) <- seq_len(nrow(ship_accidents_new))
ship_accidents_new
## ID SHIP_TYPE TOTAL_SERVICE TOTAL_INCIDENTS
## 1 1 A 127 0
## 2 2 A 63 0
## 3 4 A 1095 4
## 4 5 A 1512 6
## 5 6 A 3353 18
## 6 8 A 2244 11
## 7 9 B 44882 39
## 8 10 B 17176 29
## 9 11 B 28609 58
## 10 12 B 20370 53
## 11 13 B 7064 12
## 12 14 B 13099 44
## 13 16 B 7117 18
## 14 17 C 1179 1
## 15 18 C 552 1
## 16 19 C 781 0
## 17 20 C 676 1
## 18 21 C 783 6
## 19 22 C 1948 2
## 20 23 C 0 0
## 21 24 C 274 1
## 22 25 D 251 0
## 23 26 D 105 0
## 24 27 D 288 0
## 25 28 D 192 0
## 26 29 D 349 2
## 27 30 D 1208 11
## 28 32 D 2051 4
## 29 33 E 45 0
## 30 35 E 789 7
## 31 36 E 437 7
## 32 37 E 1157 5
## 33 38 E 2161 12
## 34 40 E 542 1
# NOTE(review): attach() is kept only in case later code relies on the bare
# column names it exposes; the plotting calls below no longer depend on it
# and it should be removed once that is confirmed.
attach(ship_accidents_new)
# Scatter plot of total incidents (y) against total service (x), with each
# point labelled by its ship type.
with(ship_accidents_new, {
  plot(TOTAL_SERVICE, TOTAL_INCIDENTS, main = "SHIP ACCIDENTS",
       xlab = "TOTAL_SERVICE ", ylab = "TOTAL_INCIDENTS ", pch = 19)
  # Regression line. The formula is response ~ predictor, so the plotted y
  # variable (incidents) goes on the left; the reversed formula produced a
  # line whose intercept and slope belonged to the swapped axes.
  abline(lm(TOTAL_INCIDENTS ~ TOTAL_SERVICE), col = "red")
  # Lowess smoother; lowess() takes its arguments in (x, y) order.
  lines(lowess(TOTAL_SERVICE, TOTAL_INCIDENTS), col = "blue")
  text(TOTAL_SERVICE, TOTAL_INCIDENTS, labels = SHIP_TYPE, pos = 1)
})
# Histogram of total service over all rows of the cleaned data
# (all ship types pooled together).
hist(ship_accidents_new$TOTAL_SERVICE)
# Histogram of total incidents over all rows of the cleaned data
# (all ship types pooled together).
hist(ship_accidents_new$TOTAL_INCIDENTS)