# ---
# title: Himalayan Database Final Project
# author: Tibor Balazs
# date: January 29, 2025
# output:
#   html_document:
#     toc: true
#     theme: united
# ---
## Introduction
#
# The Himalayan Database is a comprehensive archive documenting mountaineering
# expeditions in the Nepal Himalaya. It was originally compiled by Elizabeth
# Hawley and details peak characteristics, expedition information, success rates,
# and more.
#
## Setup
#
# I read in two cleaned CSV files directly from the GitHub link. These files
# contain expedition and peak data. I then convert them into data.table
# objects to allow for efficient filtering and aggregation operations.
#+ packages_and_data
library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(RColorBrewer)
# Source data: TidyTuesday release of 2025-01-21, hosted on GitHub.
exped_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-01-21/exped_tidy.csv"
peaks_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-01-21/peaks_tidy.csv"
# Download and parse the expedition table. No column types are supplied, so
# readr guesses them and reports the guessed specification in a message.
exped_tidy <- readr::read_csv(file = exped_url)
## Rows: 882 Columns: 69
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): EXPID, PEAKID, SEASON_FACTOR, HOST_FACTOR, ROUTE1, ROUTE2, NATION...
## dbl (17): YEAR, SEASON, HOST, SMTDAYS, TOTDAYS, TERMREASON, HIGHPOINT, CAMP...
## lgl (27): ROUTE3, ROUTE4, SUCCESS1, SUCCESS2, SUCCESS3, SUCCESS4, ASCENT3, ...
## date (3): BCDATE, SMTDATE, TERMDATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download and parse the peak table from GitHub; column types are guessed by
# readr because none are supplied.
peaks_tidy <- readr::read_csv(file = peaks_url)
## Rows: 480 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): PEAKID, PKNAME, PKNAME2, LOCATION, HIMAL_FACTOR, REGION_FACTOR, RE...
## dbl (12): HEIGHTM, HEIGHTF, HIMAL, REGION, TREKYEAR, PHOST, PSTATUS, PEAKMEM...
## lgl (3): OPEN, UNLISTED, TREKKING
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Turn both tibbles into data.tables so the `DT[i, j, by]` syntax can be used
# for the filtering, aggregation and merging steps.
exped_dt <- data.table::as.data.table(x = exped_tidy)
peaks_dt <- data.table::as.data.table(x = peaks_tidy)
# Print a compact overview (column names, types, first values) of the
# expedition table.
utils::str(object = exped_dt)
## Classes 'data.table' and 'data.frame': 882 obs. of 69 variables:
## $ EXPID : chr "EVER20101" "EVER20102" "EVER20103" "AMAD20301" ...
## $ PEAKID : chr "EVER" "EVER" "EVER" "AMAD" ...
## $ YEAR : num 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ SEASON : num 1 1 1 3 3 3 3 3 3 3 ...
## $ SEASON_FACTOR : chr "Spring" "Spring" "Spring" "Autumn" ...
## $ HOST : num 2 2 2 1 1 1 1 1 1 1 ...
## $ HOST_FACTOR : chr "China" "China" "China" "Nepal" ...
## $ ROUTE1 : chr "N Col-NE Ridge" "N Col-NE Ridge" "N Col-NE Ridge" "SW Ridge" ...
## $ ROUTE2 : chr NA NA NA NA ...
## $ ROUTE3 : logi NA NA NA NA NA NA ...
## $ ROUTE4 : logi NA NA NA NA NA NA ...
## $ NATION : chr "China" "China" "China" "Nepal" ...
## $ LEADERS : chr "Tibetan Rope-Fixing" "Ci Luo (Tselo)" "Tsering Samdrup" "Chhang Dawa Sherpa" ...
## $ SPONSOR : chr "Tibetan Rope-Fixing Everest North 2020" "Chinese Mount Everest Survey Team" "Holy Mountain Adventure Everest Expedition 2020" "Seven Summit Treks Ama Dablam Expedition 2020" ...
## $ SUCCESS1 : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ SUCCESS2 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ SUCCESS3 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ SUCCESS4 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ASCENT1 : chr NA NA NA NA ...
## $ ASCENT2 : chr NA NA NA NA ...
## $ ASCENT3 : logi NA NA NA NA NA NA ...
## $ ASCENT4 : logi NA NA NA NA NA NA ...
## $ CLAIMED : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ DISPUTED : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ COUNTRIES : chr NA NA NA "Canada, Czech Republic, France, Poland, Russia, Switzerland, Ukraine, USA" ...
## $ APPROACH : chr "Lhasa->Tingri->Everest BC" NA "Lhasa->Tingri->Everest BC" NA ...
## $ BCDATE : Date, format: NA NA ...
## $ SMTDATE : Date, format: "2020-05-26" "2020-05-27" ...
## $ SMTTIME : chr "1515" "0945" "0545" "1300" ...
## $ SMTDAYS : num 0 0 35 1 9 16 11 3 0 10 ...
## $ TOTDAYS : num 0 0 38 0 11 17 13 4 0 0 ...
## $ TERMDATE : Date, format: NA NA ...
## $ TERMREASON : num 1 1 1 1 1 1 4 1 12 1 ...
## $ TERMREASON_FACTOR: chr "Success (main peak)" "Success (main peak)" "Success (main peak)" "Success (main peak)" ...
## $ TERMNOTE : chr NA NA NA NA ...
## $ HIGHPOINT : num 8849 8849 8849 6814 6814 ...
## $ TRAVERSE : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ SKI : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ PARAPENTE : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ CAMPS : num 3 3 3 2 2 2 2 2 0 2 ...
## $ ROPE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TOTMEMBERS : num 0 12 20 14 6 2 4 1 1 6 ...
## $ SMTMEMBERS : num 0 8 14 9 6 2 0 1 0 1 ...
## $ MDEATHS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TOTHIRED : num 6 0 22 19 8 1 2 1 0 6 ...
## $ SMTHIRED : num 6 0 21 14 8 1 0 1 0 3 ...
## $ HDEATHS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NOHIRED : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ O2USED : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ O2NONE : logi FALSE FALSE FALSE TRUE TRUE TRUE ...
## $ O2CLIMB : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ O2DESCENT : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ O2SLEEP : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ O2MEDICAL : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ O2TAKEN : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ O2UNKWN : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ OTHERSMTS : chr NA NA NA NA ...
## $ CAMPSITES : chr "BC,ABC,C1,C2,C3,Smt(26/05)" "BC,ABC,C1,C2,C3,Smt(27/05)" "BC(23/04,5200m),IC(26/04,5800m),ABC(05/01,6500m),C1(25/05,7028m),C2(26/05,7790m),C3(27/05,8300m),Smt(28/05)" "BC(09/11,4450m),C1(5600m),C2(5900m),Smt(10,12-13,15/11)" ...
## $ ROUTEMEMO : num NA 221011 203869 NA 29755 ...
## $ ACCIDENTS : chr NA NA NA NA ...
## $ ACHIEVMENT : chr NA NA NA NA ...
## $ AGENCY : chr "Holy Mountain Adventure" NA "Holy Mountain Adventure" "Seven Summit Treks" ...
## $ COMRTE : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ STDRTE : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ PRIMRTE : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ PRIMMEM : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ PRIMREF : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ PRIMID : chr NA NA NA NA ...
## $ CHKSUM : num 2465291 2465292 2465293 2463299 2463299 ...
## - attr(*, "spec")=
## .. cols(
## .. EXPID = col_character(),
## .. PEAKID = col_character(),
## .. YEAR = col_double(),
## .. SEASON = col_double(),
## .. SEASON_FACTOR = col_character(),
## .. HOST = col_double(),
## .. HOST_FACTOR = col_character(),
## .. ROUTE1 = col_character(),
## .. ROUTE2 = col_character(),
## .. ROUTE3 = col_logical(),
## .. ROUTE4 = col_logical(),
## .. NATION = col_character(),
## .. LEADERS = col_character(),
## .. SPONSOR = col_character(),
## .. SUCCESS1 = col_logical(),
## .. SUCCESS2 = col_logical(),
## .. SUCCESS3 = col_logical(),
## .. SUCCESS4 = col_logical(),
## .. ASCENT1 = col_character(),
## .. ASCENT2 = col_character(),
## .. ASCENT3 = col_logical(),
## .. ASCENT4 = col_logical(),
## .. CLAIMED = col_logical(),
## .. DISPUTED = col_logical(),
## .. COUNTRIES = col_character(),
## .. APPROACH = col_character(),
## .. BCDATE = col_date(format = ""),
## .. SMTDATE = col_date(format = ""),
## .. SMTTIME = col_character(),
## .. SMTDAYS = col_double(),
## .. TOTDAYS = col_double(),
## .. TERMDATE = col_date(format = ""),
## .. TERMREASON = col_double(),
## .. TERMREASON_FACTOR = col_character(),
## .. TERMNOTE = col_character(),
## .. HIGHPOINT = col_double(),
## .. TRAVERSE = col_logical(),
## .. SKI = col_logical(),
## .. PARAPENTE = col_logical(),
## .. CAMPS = col_double(),
## .. ROPE = col_double(),
## .. TOTMEMBERS = col_double(),
## .. SMTMEMBERS = col_double(),
## .. MDEATHS = col_double(),
## .. TOTHIRED = col_double(),
## .. SMTHIRED = col_double(),
## .. HDEATHS = col_double(),
## .. NOHIRED = col_logical(),
## .. O2USED = col_logical(),
## .. O2NONE = col_logical(),
## .. O2CLIMB = col_logical(),
## .. O2DESCENT = col_logical(),
## .. O2SLEEP = col_logical(),
## .. O2MEDICAL = col_logical(),
## .. O2TAKEN = col_logical(),
## .. O2UNKWN = col_logical(),
## .. OTHERSMTS = col_character(),
## .. CAMPSITES = col_character(),
## .. ROUTEMEMO = col_double(),
## .. ACCIDENTS = col_character(),
## .. ACHIEVMENT = col_character(),
## .. AGENCY = col_character(),
## .. COMRTE = col_logical(),
## .. STDRTE = col_logical(),
## .. PRIMRTE = col_logical(),
## .. PRIMMEM = col_logical(),
## .. PRIMREF = col_logical(),
## .. PRIMID = col_character(),
## .. CHKSUM = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## - attr(*, ".internal.selfref")=<externalptr>
# Print a compact overview (column names, types, first values) of the peak
# table.
utils::str(object = peaks_dt)
## Classes 'data.table' and 'data.frame': 480 obs. of 29 variables:
## $ PEAKID : chr "AMAD" "AMPG" "ANN1" "ANN2" ...
## $ PKNAME : chr "Ama Dablam" "Amphu Gyabjen" "Annapurna I" "Annapurna II" ...
## $ PKNAME2 : chr "Amai Dablang" "Amphu Gyabien" NA NA ...
## $ LOCATION : chr "Khumbu Himal" "Khumbu Himal (N of Ama Dablam)" "Annapurna Himal" "Annapurna Himal" ...
## $ HEIGHTM : num 6814 5630 8091 7937 7555 ...
## $ HEIGHTF : num 22356 18471 26545 26040 24787 ...
## $ HIMAL : num 12 12 1 1 1 1 1 1 1 2 ...
## $ HIMAL_FACTOR : chr "Khumbu" "Khumbu" "Annapurna" "Annapurna" ...
## $ REGION : num 2 2 5 5 5 5 5 5 5 7 ...
## $ REGION_FACTOR : chr "Khumbu-Rolwaling-Makalu" "Khumbu-Rolwaling-Makalu" "Annapurna-Damodar-Peri" "Annapurna-Damodar-Peri" ...
## $ OPEN : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ UNLISTED : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TREKKING : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TREKYEAR : num NA NA NA NA NA NA NA NA NA NA ...
## $ RESTRICT : chr NA "Opened in 2002" NA NA ...
## $ PHOST : num 1 1 1 1 1 1 1 1 1 1 ...
## $ PHOST_FACTOR : chr "Nepal only" "Nepal only" "Nepal only" "Nepal only" ...
## $ PSTATUS : num 2 2 2 2 2 2 2 2 2 2 ...
## $ PSTATUS_FACTOR: chr "Climbed" "Climbed" "Climbed" "Climbed" ...
## $ PEAKMEMO : num 8 20 23 31 35 40 44 46 48 54 ...
## $ PYEAR : num 1961 1953 1950 1960 1961 ...
## $ PSEASON : num 1 1 1 1 1 1 1 3 3 1 ...
## $ PEXPID : chr "AMAD61101" "AMPG53101" "ANN150101" "ANN260101" ...
## $ PSMTDATE : chr "Mar 13" "Apr 11" "Jun 03" "May 17" ...
## $ PCOUNTRY : chr "New Zealand, USA, UK" "UK" "France" "UK, Nepal" ...
## $ PSUMMITERS : chr "Mike Gill, Wally Romanes, Barry Bishop, Michael Ward" "John Hunt, Tom Bourdillon" "Maurice Herzog, Louis Lachenal" "Richard Grant, Chris Bonington, Ang Nyima Sherpa" ...
## $ PSMTNOTE : chr NA NA NA NA ...
## $ REFERMEMO : num NA NA 25 33 NA NA NA NA NA NA ...
## $ PHOTOMEMO : num 13 NA 26 34 37 42 45 NA 50 57 ...
## - attr(*, "spec")=
## .. cols(
## .. PEAKID = col_character(),
## .. PKNAME = col_character(),
## .. PKNAME2 = col_character(),
## .. LOCATION = col_character(),
## .. HEIGHTM = col_double(),
## .. HEIGHTF = col_double(),
## .. HIMAL = col_double(),
## .. HIMAL_FACTOR = col_character(),
## .. REGION = col_double(),
## .. REGION_FACTOR = col_character(),
## .. OPEN = col_logical(),
## .. UNLISTED = col_logical(),
## .. TREKKING = col_logical(),
## .. TREKYEAR = col_double(),
## .. RESTRICT = col_character(),
## .. PHOST = col_double(),
## .. PHOST_FACTOR = col_character(),
## .. PSTATUS = col_double(),
## .. PSTATUS_FACTOR = col_character(),
## .. PEAKMEMO = col_double(),
## .. PYEAR = col_double(),
## .. PSEASON = col_double(),
## .. PEXPID = col_character(),
## .. PSMTDATE = col_character(),
## .. PCOUNTRY = col_character(),
## .. PSUMMITERS = col_character(),
## .. PSMTNOTE = col_character(),
## .. REFERMEMO = col_double(),
## .. PHOTOMEMO = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## - attr(*, ".internal.selfref")=<externalptr>
# # Data Transformations
#
# Below, I use data.table syntax to filter and aggregate expedition data,
# then merge with the peaks data.
# ## Filtering with data.table
#
# I focus on expeditions that:
# - Have `TOTMEMBERS > 0`.
# - Did not terminate with the reason "Did not attempt climb".
# - Occurred in Spring or Autumn.
#+ data_filtering
# Keep Spring/Autumn expeditions that have at least one member and whose
# termination reason is not "Did not attempt climb". A row for which the
# combined condition evaluates to NA is dropped by data.table's `i` subsetting.
exped_filtered <- exped_dt[
  SEASON_FACTOR %in% c("Spring", "Autumn") &
    TOTMEMBERS > 0 &
    TERMREASON_FACTOR != "Did not attempt climb"
]
# Show the first six rows of the filtered table.
head(exped_filtered, n = 6L)
## EXPID PEAKID YEAR SEASON SEASON_FACTOR HOST HOST_FACTOR ROUTE1
## <char> <char> <num> <num> <char> <num> <char> <char>
## 1: EVER20102 EVER 2020 1 Spring 2 China N Col-NE Ridge
## 2: EVER20103 EVER 2020 1 Spring 2 China N Col-NE Ridge
## 3: AMAD20301 AMAD 2020 3 Autumn 1 Nepal SW Ridge
## 4: AMAD20302 AMAD 2020 3 Autumn 1 Nepal SW Ridge
## 5: AMAD20303 AMAD 2020 3 Autumn 1 Nepal SW Ridge
## 6: AMAD20304 AMAD 2020 3 Autumn 1 Nepal SW Ridge
## ROUTE2 ROUTE3 ROUTE4 NATION LEADERS
## <char> <lgcl> <lgcl> <char> <char>
## 1: <NA> NA NA China Ci Luo (Tselo)
## 2: <NA> NA NA China Tsering Samdrup
## 3: <NA> NA NA Nepal Chhang Dawa Sherpa
## 4: <NA> NA NA USA Garrett Madison
## 5: <NA> NA NA UK Jon Gupta
## 6: <NA> NA NA UK Kenton Cool, Tim Mosedale
## SPONSOR SUCCESS1 SUCCESS2 SUCCESS3
## <char> <lgcl> <lgcl> <lgcl>
## 1: Chinese Mount Everest Survey Team TRUE FALSE FALSE
## 2: Holy Mountain Adventure Everest Expedition 2020 TRUE FALSE FALSE
## 3: Seven Summit Treks Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 4: Madison Mountaineering Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 5: Himalayan Guides Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 6: Himalayan Guides Ama Dablam Expedition 2020 FALSE FALSE FALSE
## SUCCESS4 ASCENT1 ASCENT2 ASCENT3 ASCENT4 CLAIMED DISPUTED
## <lgcl> <char> <char> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: FALSE <NA> <NA> NA NA FALSE FALSE
## 2: FALSE <NA> <NA> NA NA FALSE FALSE
## 3: FALSE <NA> <NA> NA NA FALSE FALSE
## 4: FALSE <NA> <NA> NA NA FALSE FALSE
## 5: FALSE <NA> <NA> NA NA FALSE FALSE
## 6: FALSE <NA> <NA> NA NA FALSE FALSE
## COUNTRIES
## <char>
## 1: <NA>
## 2: <NA>
## 3: Canada, Czech Republic, France, Poland, Russia, Switzerland, Ukraine, USA
## 4: Canada, Qatar
## 5: <NA>
## 6: <NA>
## APPROACH BCDATE
## <char> <Date>
## 1: <NA> <NA>
## 2: Lhasa->Tingri->Everest BC 2020-04-23
## 3: <NA> 2020-11-09
## 4: Lukla->Pangboche->Ama Dablam BC 2020-11-01
## 5: Lukla->Namche->Dingboche->Chhukung Ri->Pangboche->Ama Dablam BC 2020-11-15
## 6: Lukla->Namche->Dingboche->Chhukung Ri->Pangboche->Ama Dablam BC 2020-11-16
## SMTDATE SMTTIME SMTDAYS TOTDAYS TERMDATE TERMREASON
## <Date> <char> <num> <num> <Date> <num>
## 1: 2020-05-27 0945 0 0 <NA> 1
## 2: 2020-05-28 0545 35 38 2020-05-31 1
## 3: 2020-11-10 1300 1 0 <NA> 1
## 4: 2020-11-10 1300 9 11 2020-11-12 1
## 5: 2020-12-01 1243 16 17 2020-12-02 1
## 6: 2020-11-27 <NA> 11 13 2020-11-29 4
## TERMREASON_FACTOR TERMNOTE
## <char> <char>
## 1: Success (main peak) <NA>
## 2: Success (main peak) <NA>
## 3: Success (main peak) <NA>
## 4: Success (main peak) <NA>
## 5: Success (main peak) <NA>
## 6: Bad weather (storms, high winds) Abandoned at 6650m due to high winds
## HIGHPOINT TRAVERSE SKI PARAPENTE CAMPS ROPE TOTMEMBERS SMTMEMBERS
## <num> <lgcl> <lgcl> <lgcl> <num> <num> <num> <num>
## 1: 8849 FALSE FALSE FALSE 3 0 12 8
## 2: 8849 FALSE FALSE FALSE 3 0 20 14
## 3: 6814 FALSE FALSE FALSE 2 0 14 9
## 4: 6814 FALSE FALSE FALSE 2 0 6 6
## 5: 6814 FALSE FALSE FALSE 2 0 2 2
## 6: 6650 FALSE FALSE FALSE 2 0 4 0
## MDEATHS TOTHIRED SMTHIRED HDEATHS NOHIRED O2USED O2NONE O2CLIMB O2DESCENT
## <num> <num> <num> <num> <lgcl> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: 0 0 0 0 FALSE TRUE FALSE TRUE FALSE
## 2: 0 22 21 0 FALSE TRUE FALSE TRUE FALSE
## 3: 0 19 14 0 FALSE FALSE TRUE FALSE FALSE
## 4: 0 8 8 0 FALSE FALSE TRUE FALSE FALSE
## 5: 0 1 1 0 FALSE FALSE TRUE FALSE FALSE
## 6: 0 2 0 0 FALSE FALSE TRUE FALSE FALSE
## O2SLEEP O2MEDICAL O2TAKEN O2UNKWN OTHERSMTS
## <lgcl> <lgcl> <lgcl> <lgcl> <char>
## 1: TRUE FALSE FALSE FALSE <NA>
## 2: TRUE FALSE FALSE FALSE <NA>
## 3: FALSE FALSE FALSE FALSE <NA>
## 4: FALSE FALSE FALSE FALSE <NA>
## 5: FALSE FALSE FALSE FALSE <NA>
## 6: FALSE FALSE FALSE FALSE <NA>
## CAMPSITES
## <char>
## 1: BC,ABC,C1,C2,C3,Smt(27/05)
## 2: BC(23/04,5200m),IC(26/04,5800m),ABC(05/01,6500m),C1(25/05,7028m),C2(26/05,7790m),C3(27/05,8300m),Smt(28/05)
## 3: BC(09/11,4450m),C1(5600m),C2(5900m),Smt(10,12-13,15/11)
## 4: BC(01/11,4700m),C1(04/11,5600m),C2(10/11,6000m),Smt(10-11/11)
## 5: BC(15/11,4400m),ABC(17/11,5350m),C1(18/11,5800m),C2(26/11,6000m),Smt(01/12)
## 6: BC(16/11,4500m),ABC(17/11/5400m),C1(19/12,5700m),C2(26/11,6000m),xxx(27/11,6650m)
## ROUTEMEMO ACCIDENTS ACHIEVMENT AGENCY COMRTE STDRTE PRIMRTE
## <num> <char> <char> <char> <lgcl> <lgcl> <lgcl>
## 1: 221011 <NA> <NA> <NA> TRUE TRUE FALSE
## 2: 203869 <NA> <NA> Holy Mountain Adventure TRUE TRUE FALSE
## 3: NA <NA> <NA> Seven Summit Treks TRUE FALSE FALSE
## 4: 29755 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## 5: 107752 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## 6: 29661 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## PRIMMEM PRIMREF PRIMID CHKSUM
## <lgcl> <lgcl> <char> <num>
## 1: FALSE FALSE <NA> 2465292
## 2: FALSE FALSE <NA> 2465293
## 3: FALSE FALSE <NA> 2463299
## 4: FALSE FALSE <NA> 2463299
## 5: FALSE FALSE <NA> 2463320
## 6: FALSE FALSE <NA> 2463316
# These filtering criteria help me zero in on meaningful climbing attempts.
# ## Aggregation with data.table
#
# Next, I compute:
# - The average number of members, `avg_members`.
# - The total number of expeditions, `n_exped`.
#
# Grouped by `YEAR` and `SEASON_FACTOR`.
#+ data_aggregation
# For every (YEAR, SEASON_FACTOR) group: the mean of TOTMEMBERS (ignoring NA)
# and the number of expeditions in the group.
exped_summary <- exped_filtered[
  ,
  list(
    avg_members = mean(TOTMEMBERS, na.rm = TRUE),
    n_exped = .N
  ),
  by = c("YEAR", "SEASON_FACTOR")
]
# Print the summary table.
exped_summary
## YEAR SEASON_FACTOR avg_members n_exped
## <num> <char> <num> <int>
## 1: 2020 Spring 16.000000 2
## 2: 2020 Autumn 7.250000 12
## 3: 2021 Spring 7.161905 105
## 4: 2021 Autumn 7.505618 89
## 5: 2022 Spring 7.949153 118
## 6: 2022 Autumn 8.939597 149
## 7: 2023 Spring 8.145038 131
## 8: 2023 Autumn 9.267176 131
## 9: 2024 Spring 10.478723 94
# This summary reveals how many expeditions occur each year and season,
# as well as average expedition size.
# ## Merging Datasets
#
# I merge `exped_filtered` with `peaks_dt` by `PEAKID`. This adds extra columns
# (e.g., `HEIGHTM`) to the expedition data, enabling deeper analysis of
# expedition characteristics relative to peak attributes.
#+ data_merging
# Left join on PEAKID: every filtered expedition is kept (`all.x = TRUE`) and
# the columns of its peak are appended; expeditions without a matching peak
# get NA in the peak columns.
merged_dt <- merge(
  exped_filtered,
  peaks_dt,
  by = "PEAKID",
  all.x = TRUE
)
# Show the first six rows of the merged table.
head(merged_dt, n = 6L)
## Key: <PEAKID>
## PEAKID EXPID YEAR SEASON SEASON_FACTOR HOST HOST_FACTOR ROUTE1
## <char> <char> <num> <num> <char> <num> <char> <char>
## 1: AMAD AMAD20301 2020 3 Autumn 1 Nepal SW Ridge
## 2: AMAD AMAD20302 2020 3 Autumn 1 Nepal SW Ridge
## 3: AMAD AMAD20303 2020 3 Autumn 1 Nepal SW Ridge
## 4: AMAD AMAD20304 2020 3 Autumn 1 Nepal SW Ridge
## 5: AMAD AMAD20305 2020 3 Autumn 1 Nepal SW Ridge
## 6: AMAD AMAD20307 2020 3 Autumn 1 Nepal SW Ridge
## ROUTE2 ROUTE3 ROUTE4 NATION LEADERS
## <char> <lgcl> <lgcl> <char> <char>
## 1: <NA> NA NA Nepal Chhang Dawa Sherpa
## 2: <NA> NA NA USA Garrett Madison
## 3: <NA> NA NA UK Jon Gupta
## 4: <NA> NA NA UK Kenton Cool, Tim Mosedale
## 5: <NA> NA NA Italy Manuel Villani
## 6: <NA> NA NA India Debasish Biswas
## SPONSOR SUCCESS1 SUCCESS2 SUCCESS3
## <char> <lgcl> <lgcl> <lgcl>
## 1: Seven Summit Treks Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 2: Madison Mountaineering Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 3: Himalayan Guides Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 4: Himalayan Guides Ama Dablam Expedition 2020 FALSE FALSE FALSE
## 5: Furtenbach Adventures Ama Dablam Expedition 2020 TRUE FALSE FALSE
## 6: Seven Summit Treks Ama Dablam Expedition 2020 TRUE FALSE FALSE
## SUCCESS4 ASCENT1 ASCENT2 ASCENT3 ASCENT4 CLAIMED DISPUTED
## <lgcl> <char> <char> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: FALSE <NA> <NA> NA NA FALSE FALSE
## 2: FALSE <NA> <NA> NA NA FALSE FALSE
## 3: FALSE <NA> <NA> NA NA FALSE FALSE
## 4: FALSE <NA> <NA> NA NA FALSE FALSE
## 5: FALSE <NA> <NA> NA NA FALSE FALSE
## 6: FALSE <NA> <NA> NA NA FALSE FALSE
## COUNTRIES
## <char>
## 1: Canada, Czech Republic, France, Poland, Russia, Switzerland, Ukraine, USA
## 2: Canada, Qatar
## 3: <NA>
## 4: <NA>
## 5: <NA>
## 6: Austria, UK
## APPROACH BCDATE
## <char> <Date>
## 1: <NA> 2020-11-09
## 2: Lukla->Pangboche->Ama Dablam BC 2020-11-01
## 3: Lukla->Namche->Dingboche->Chhukung Ri->Pangboche->Ama Dablam BC 2020-11-15
## 4: Lukla->Namche->Dingboche->Chhukung Ri->Pangboche->Ama Dablam BC 2020-11-16
## 5: Lukla->Pangboche->Ama Dablam BC 2020-11-26
## 6: Lukla->Pangboche->Ama Dablam BC 2020-11-20
## SMTDATE SMTTIME SMTDAYS TOTDAYS TERMDATE TERMREASON
## <Date> <char> <num> <num> <Date> <num>
## 1: 2020-11-10 1300 1 0 <NA> 1
## 2: 2020-11-10 1300 9 11 2020-11-12 1
## 3: 2020-12-01 1243 16 17 2020-12-02 1
## 4: 2020-11-27 <NA> 11 13 2020-11-29 4
## 5: 2020-11-29 0930 3 4 2020-11-30 1
## 6: 2020-11-30 0615 10 0 <NA> 1
## TERMREASON_FACTOR TERMNOTE
## <char> <char>
## 1: Success (main peak) <NA>
## 2: Success (main peak) <NA>
## 3: Success (main peak) <NA>
## 4: Bad weather (storms, high winds) Abandoned at 6650m due to high winds
## 5: Success (main peak) <NA>
## 6: Success (main peak) <NA>
## HIGHPOINT TRAVERSE SKI PARAPENTE CAMPS ROPE TOTMEMBERS SMTMEMBERS
## <num> <lgcl> <lgcl> <lgcl> <num> <num> <num> <num>
## 1: 6814 FALSE FALSE FALSE 2 0 14 9
## 2: 6814 FALSE FALSE FALSE 2 0 6 6
## 3: 6814 FALSE FALSE FALSE 2 0 2 2
## 4: 6650 FALSE FALSE FALSE 2 0 4 0
## 5: 6814 FALSE FALSE FALSE 2 0 1 1
## 6: 6814 FALSE FALSE FALSE 2 0 6 1
## MDEATHS TOTHIRED SMTHIRED HDEATHS NOHIRED O2USED O2NONE O2CLIMB O2DESCENT
## <num> <num> <num> <num> <lgcl> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: 0 19 14 0 FALSE FALSE TRUE FALSE FALSE
## 2: 0 8 8 0 FALSE FALSE TRUE FALSE FALSE
## 3: 0 1 1 0 FALSE FALSE TRUE FALSE FALSE
## 4: 0 2 0 0 FALSE FALSE TRUE FALSE FALSE
## 5: 0 1 1 0 FALSE FALSE TRUE FALSE FALSE
## 6: 0 6 3 0 FALSE FALSE TRUE FALSE FALSE
## O2SLEEP O2MEDICAL O2TAKEN O2UNKWN OTHERSMTS
## <lgcl> <lgcl> <lgcl> <lgcl> <char>
## 1: FALSE FALSE FALSE FALSE <NA>
## 2: FALSE FALSE FALSE FALSE <NA>
## 3: FALSE FALSE FALSE FALSE <NA>
## 4: FALSE FALSE FALSE FALSE <NA>
## 5: FALSE FALSE FALSE FALSE <NA>
## 6: FALSE FALSE FALSE FALSE <NA>
## CAMPSITES
## <char>
## 1: BC(09/11,4450m),C1(5600m),C2(5900m),Smt(10,12-13,15/11)
## 2: BC(01/11,4700m),C1(04/11,5600m),C2(10/11,6000m),Smt(10-11/11)
## 3: BC(15/11,4400m),ABC(17/11,5350m),C1(18/11,5800m),C2(26/11,6000m),Smt(01/12)
## 4: BC(16/11,4500m),ABC(17/11/5400m),C1(19/12,5700m),C2(26/11,6000m),xxx(27/11,6650m)
## 5: BC(26/11,4567m),C1(27/11,5700m),C2(28/11,5900m),Smt(29/11)
## 6: BC(20/11,4450m),C1(5600m),C2(28/11,5900m),Smt(30/11)
## ROUTEMEMO ACCIDENTS ACHIEVMENT AGENCY COMRTE STDRTE PRIMRTE
## <num> <char> <char> <char> <lgcl> <lgcl> <lgcl>
## 1: NA <NA> <NA> Seven Summit Treks TRUE FALSE FALSE
## 2: 29755 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## 3: 107752 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## 4: 29661 <NA> <NA> Himalayan Guides TRUE FALSE FALSE
## 5: 17154 <NA> <NA> Iceland Trekking TRUE FALSE FALSE
## 6: 69150 <NA> <NA> Seven Summit Treks TRUE FALSE FALSE
## PRIMMEM PRIMREF PRIMID CHKSUM PKNAME PKNAME2 LOCATION HEIGHTM
## <lgcl> <lgcl> <char> <num> <char> <char> <char> <num>
## 1: FALSE FALSE <NA> 2463299 Ama Dablam Amai Dablang Khumbu Himal 6814
## 2: FALSE FALSE <NA> 2463299 Ama Dablam Amai Dablang Khumbu Himal 6814
## 3: FALSE FALSE <NA> 2463320 Ama Dablam Amai Dablang Khumbu Himal 6814
## 4: FALSE FALSE <NA> 2463316 Ama Dablam Amai Dablang Khumbu Himal 6814
## 5: FALSE FALSE <NA> 2463318 Ama Dablam Amai Dablang Khumbu Himal 6814
## 6: FALSE FALSE <NA> 2463318 Ama Dablam Amai Dablang Khumbu Himal 6814
## HEIGHTF HIMAL HIMAL_FACTOR REGION REGION_FACTOR OPEN UNLISTED
## <num> <num> <char> <num> <char> <lgcl> <lgcl>
## 1: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## 2: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## 3: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## 4: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## 5: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## 6: 22356 12 Khumbu 2 Khumbu-Rolwaling-Makalu TRUE FALSE
## TREKKING TREKYEAR RESTRICT PHOST PHOST_FACTOR PSTATUS PSTATUS_FACTOR
## <lgcl> <num> <char> <num> <char> <num> <char>
## 1: FALSE NA <NA> 1 Nepal only 2 Climbed
## 2: FALSE NA <NA> 1 Nepal only 2 Climbed
## 3: FALSE NA <NA> 1 Nepal only 2 Climbed
## 4: FALSE NA <NA> 1 Nepal only 2 Climbed
## 5: FALSE NA <NA> 1 Nepal only 2 Climbed
## 6: FALSE NA <NA> 1 Nepal only 2 Climbed
## PEAKMEMO PYEAR PSEASON PEXPID PSMTDATE PCOUNTRY
## <num> <num> <num> <char> <char> <char>
## 1: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## 2: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## 3: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## 4: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## 5: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## 6: 8 1961 1 AMAD61101 Mar 13 New Zealand, USA, UK
## PSUMMITERS PSMTNOTE REFERMEMO
## <char> <char> <num>
## 1: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## 2: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## 3: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## 4: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## 5: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## 6: Mike Gill, Wally Romanes, Barry Bishop, Michael Ward <NA> NA
## PHOTOMEMO
## <num>
## 1: 13
## 2: 13
## 3: 13
## 4: 13
## 5: 13
## 6: 13
# The merged table, `merged_dt`, now contains both expedition-level and
# peak-level data for each relevant climb.
# # Data Visualizations
#
# I create seven plots using at least three distinct geoms.
# Each plot features themes, titles, axes labels, and ColorBrewer
# palettes. One plot includes multiple geom layers.
# ## Plot 1: Bar Chart of Expedition Counts by Year and Season
#+ plot1_bar_chart
# Grouped bar chart: number of expeditions per year, one bar per season.
# `preserve = "single"` keeps every bar the same width, so a year that has
# data for only one season (2024 has Spring only in `exped_summary`) is not
# drawn as a double-width bar.
ggplot(exped_summary, aes(x = YEAR, y = n_exped, fill = SEASON_FACTOR)) +
  geom_col(position = position_dodge(preserve = "single")) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Expedition Counts by Year and Season",
    subtitle = "Filtered to TOTMEMBERS > 0, excluding 'Did not attempt climb'",
    x = "Year",
    y = "Number of Expeditions",
    fill = "Season"
  ) +
  theme_minimal()

# Explanation: This shows the overall number of expeditions across
# different years, split between two primary climbing seasons.
# ## Plot 2: Boxplot of Peak Heights by Himalayan Range
#+ plot2_boxplot
# Horizontal boxplots of peak height for each Himalayan range; peaks whose
# range is recorded as "Unknown" are left out. Outliers are drawn as
# semi-transparent filled circles.
ggplot(
  data = peaks_dt[HIMAL_FACTOR != "Unknown"],
  mapping = aes(x = HIMAL_FACTOR, y = HEIGHTM)
) +
  geom_boxplot(fill = "lightblue", outlier.shape = 16, outlier.alpha = 0.4) +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Boxplot of Peak Heights by Himalayan Range",
    x = "Himalayan Range",
    y = "Height (m)"
  )

# Explanation: I exclude unknown ranges and flip coordinates for easier
# reading. Some Himalayan ranges, such as Khumbu, reach extremely high altitudes.
# ## Plot 3: Scatter Plot + Smoothing (Multiple Geoms)
#
# I investigate how expedition size (`TOTMEMBERS`) correlates with peak height
# (`HEIGHTM`). Coloring by `SEASON_FACTOR` helps me distinguish patterns by season,
# and adding a smoothing line (dashed) provides a quick linear model estimate.
#+ plot3_scatter_smooth
# Scatter of team size against peak height, coloured by season, with one
# dashed linear-regression line per season (no confidence band).
ggplot(
  data = merged_dt,
  mapping = aes(x = HEIGHTM, y = TOTMEMBERS, colour = SEASON_FACTOR)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", linetype = "dashed", se = FALSE) +
  scale_colour_brewer(palette = "Dark2") +
  theme_minimal() +
  labs(
    title = "Expedition Size vs. Peak Height",
    subtitle = "Scatter + Linear Model Smoothing",
    x = "Peak Height (m)",
    y = "Total Members",
    colour = "Season"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Explanation: Higher peaks may require more climbers, especially in Spring.
# ## Plot 4: Histogram of Expedition Total Days (Excluding 0)
#
# I remove expeditions with `TOTDAYS == 0` to avoid skew. This shows how many
# days expeditions typically last.
#+ plot4_histogram
# Histogram (30 bins) of expedition duration, restricted to expeditions with
# a positive TOTDAYS value.
ggplot(
  data = exped_filtered[TOTDAYS > 0],
  mapping = aes(x = TOTDAYS)
) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  theme_light() +
  labs(
    title = "Distribution of Expedition Total Days (Excluding Zero)",
    x = "Total Days",
    y = "Frequency"
  )

# Explanation: Many expeditions finish in fewer than 40-45 days, though
# some extend much longer.
# ## Plot 5: Bar Chart of Termination Reasons
#
# I display how expeditions in `exped_filtered` were concluded, using
# `TERMREASON_FACTOR` on the x-axis.
#+ plot5_termination
# Horizontal bar chart counting the filtered expeditions per termination
# reason.
ggplot(
  data = exped_filtered,
  mapping = aes(x = TERMREASON_FACTOR)
) +
  geom_bar(fill = "tomato") +
  coord_flip() +
  theme_classic() +
  labs(
    title = "Termination Reasons in Filtered Expeditions",
    x = "Reason for Termination",
    y = "Count of Expeditions"
  )

# Explanation: Common reasons include success on the main peak and adverse weather.
# ## Plot 6: Faceted Scatter of Days to Summit vs. Expedition Size (Excluding 0)
#
# I compare the number of days to summit (`SMTDAYS`) to expedition size (`TOTMEMBERS`),
# excluding zero to avoid invalid records. Faceting by `SEASON_FACTOR` allows me
# to visualize differences between Spring and Autumn. Coloring by `CLAIMED`
# indicates whether success was claimed.
#+ plot6_faceted_scatter
# Scatter of team size against days to summit, one panel per season, using
# only rows with a positive SMTDAYS value; point colour shows the CLAIMED flag.
ggplot(
  data = merged_dt[SMTDAYS > 0],
  mapping = aes(x = SMTDAYS, y = TOTMEMBERS, colour = CLAIMED)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ SEASON_FACTOR) +
  scale_colour_brewer(palette = "Set2") +
  theme_minimal() +
  labs(
    title = "Days to Summit vs. Expedition Size, by Season (Excluding Zero)",
    x = "Days to Summit",
    y = "Total Members",
    colour = "Success Claimed?"
  )

# Explanation: There is wide variation in how many days it takes to summit,
# and success does not always correlate with fewer summit-day counts.
# ## Plot 7: Boxplot + Jitter of Hired Personnel by Host Country
#
# I compare total hired personnel (`TOTHIRED`) across different host countries.
# Boxplots show overall distribution, and jittered points show individual data.
#+ plot7_box_jitter
# Boxplot of hired personnel per host country with the individual expeditions
# overlaid as jittered points. Boxplot outliers are hidden because every
# observation is already drawn by the jitter layer.
# The jitter is horizontal only (`height = 0`): TOTHIRED is a count, and the
# default vertical jitter would shift points away from their true values.
# A fixed seed makes the point placement identical on every render.
ggplot(merged_dt, aes(x = HOST_FACTOR, y = TOTHIRED, fill = HOST_FACTOR)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    position = position_jitter(width = 0.2, height = 0, seed = 1),
    alpha = 0.4,
    color = "black"
  ) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Hired Personnel by Host Country",
    x = "Host Country",
    y = "Number of Hired Personnel",
    fill = "Host"
  ) +
  theme_bw()

# Explanation: Nepal-hosted expeditions may employ larger groups of hired
# personnel. However, some expeditions in other host countries also require
# notable support teams.
# # Commentary and Conclusions
#
# Filtering and Aggregation:
# I narrowed expeditions down to those with meaningful attempts during the
# two main climbing seasons. Aggregating by year and season provided a high-level
# overview of average expedition sizes.
#
# Merging:
# By merging `exped_filtered` and `peaks_dt`, I gained insights into how
# expedition size or total days might relate to peak characteristics like
# height or location.
#
# Key Observations:
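# - Neither season consistently hosts more expeditions: Autumn led in 2020 and
#   2022, Spring led in 2021, and the two tied in 2023 (2024 has Spring data only).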
# - Some Himalayan ranges contain peaks well above 8000 meters.
# - Higher peaks can be associated with larger team sizes, particularly in Spring.
# - Expeditions often last under 45 days, though certain groups stay much longer.
# - Success on the main peak and poor weather dominate termination reasons.
# - Summit-day ranges and success claims vary significantly by season.
# - Differences in hired personnel usage can be partially attributed to
# host country factors.
#
# Conclusion:
# In this analysis, I explored key trends in Nepalese mountaineering for the years 2020–2024.
# Further research could examine success rates by specific routes, nationalities,
# or equipment usage for a more detailed picture.