Surf Data Wave 2

# Load surf_data2.csv; the path is relative to the current working directory.
surf_data2 <- read.csv(file = "surf_data2.csv")
library(leaflet)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(viridis)

## Loading required package: viridisLite

# Define global overrides of the discrete colour and fill scales so that
# discrete mappings default to the viridis "magma" palette. Any arguments
# are forwarded unchanged to the underlying viridis scale.
scale_colour_discrete <- function(...) {
  scale_colour_viridis_d(option = "magma", ...)
}

scale_fill_discrete <- function(...) {
  scale_fill_viridis_d(option = "magma", ...)
}

# Renamed using python preprocessing

#surf_data2 <- surf_data2 %>%
#  rename(
#    Participation_Match = Q1a,
#    Role = Q1b,
#    Surf_Years = Q2,
#    Experience = Q3,
#  )

# Rename Skill to Years_Surfing and drop the dots from the two location
# column names.
surf_data2 <- dplyr::rename(
  surf_data2,
  Years_Surfing = Skill,
  LocationLatitude = Location.Latitude,
  LocationLongitude = Location.Longitude
)

library(stringr)

# Pull the first number out of each free-text answer and return it as numeric.
#
# regex anatomy (\\ is the escape character inside an R string):
#   "(?<=\\d),(?=\\d{3}(?!\\d))"  a comma with a digit before it and exactly
#                                 three digits after it, i.e. a thousands
#                                 separator; removed so "1,200" reads as 1200
#                                 instead of 1
#   "\\d+(?:\\.\\d+)?"            a run of digits plus an optional decimal
#                                 part, so "2.5" reads as 2.5 instead of 2
# Answers that contain no digits become NA.
extract_first_number <- function(x) {
  without_separators <- stringr::str_remove_all(
    as.character(x),
    "(?<=\\d),(?=\\d{3}(?!\\d))"
  )
  as.numeric(stringr::str_extract(without_separators, "\\d+(?:\\.\\d+)?"))
}

# extract just the numeric answers from the data
surf_data2$Experience <- extract_first_number(surf_data2$Experience)
surf_data2$Years_Surfing <- extract_first_number(surf_data2$Years_Surfing)
surf_data2$AvgBoardCost <- extract_first_number(surf_data2$AvgBoardCost)
surf_data2$NumBoardsTotal <- extract_first_number(surf_data2$NumBoardsTotal)
surf_data2$NumBoardsYearly <- extract_first_number(surf_data2$NumBoardsYearly)



# Coerce the coordinate and duration columns to numeric. Entries that cannot
# be parsed as numbers become NA, which is what the recorded coercion
# warnings below report.
surf_data2[["LocationLatitude"]] <- as.numeric(surf_data2[["LocationLatitude"]])

## Warning: NAs introduced by coercion

surf_data2[["LocationLongitude"]] <- as.numeric(surf_data2[["LocationLongitude"]])

## Warning: NAs introduced by coercion

surf_data2[["Duration..in.seconds."]] <- as.numeric(surf_data2[["Duration..in.seconds."]])

## Warning: NAs introduced by coercion

# Make sure the board columns are numeric before taking logs.
surf_data2[["AvgBoardCost"]] <- as.numeric(surf_data2[["AvgBoardCost"]])
surf_data2[["NumBoardsTotal"]] <- as.numeric(surf_data2[["NumBoardsTotal"]])
surf_data2[["NumBoardsYearly"]] <- as.numeric(surf_data2[["NumBoardsYearly"]])

# Log-transformed copies; the board count is shifted by 1 so that a count of
# zero maps to 0 instead of -Inf.
surf_data2[["log_AvgBoardCost"]] <- log(surf_data2[["AvgBoardCost"]])
surf_data2[["log_NumBoardsTotal"]] <- log(surf_data2[["NumBoardsTotal"]] + 1)

head(surf_data2[["log_NumBoardsTotal"]])

## [1] 1.609438 1.609438 0.000000 3.135494 1.609438       NA

library(leaflet)
library(viridisLite)

# Build a leaflet map with one circle marker per row of `data`, coloured by a
# numeric column on a slice of the viridis "magma" palette.
#
# Arguments:
#   data           Data frame holding the coordinate and value columns.
#   value_col      Name (string) of the column that drives marker colour.
#   palette_range  Numeric; its first two elements, each in [0, 1], are the
#                  start and end of the magma palette slice to use.
#   title          Legend title; defaults to `value_col` when NULL.
#   lat_col        Name (string) of the latitude column.
#   lng_col        Name (string) of the longitude column.
#
# Returns the leaflet map with default tiles, circle markers and a legend in
# the top-right corner. Stops with an explicit error when a requested column
# is not present in `data` or when `palette_range` is malformed.
make_leaflet_magma <- function(data, value_col,
                               palette_range = c(0.0, 0.5),
                               title = NULL,
                               lat_col = "LocationLatitude",
                               lng_col = "LocationLongitude") {

  # Fail early and clearly on misspelled column names instead of letting the
  # map-building code raise an obscure "object not found" error later.
  missing_cols <- setdiff(c(value_col, lat_col, lng_col), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  palette_ok <- is.numeric(palette_range) &&
    length(palette_range) >= 2 &&
    !anyNA(palette_range[1:2]) &&
    all(palette_range[1:2] >= 0 & palette_range[1:2] <= 1)
  if (!palette_ok) {
    stop(
      "`palette_range` must hold two numbers between 0 and 1.",
      call. = FALSE
    )
  }

  values <- data[[value_col]]

  # Colour function mapping the value column onto the chosen palette slice
  pal <- colorNumeric(
    palette = viridisLite::magma(
      256,
      begin = palette_range[1],
      end = palette_range[2]
    ),
    domain = values
  )

  # Default title if none provided
  if (is.null(title)) title <- value_col

  # Columns are passed as plain vectors, which leaflet accepts in place of
  # one-sided formulas such as ~LocationLongitude.
  leaflet(data = data) %>%
    addTiles() %>%
    addCircleMarkers(
      lng = data[[lng_col]],
      lat = data[[lat_col]],
      color = pal(values),
      fillOpacity = 0.8,
      radius = 5
    ) %>%
    addLegend(
      position = "topright",
      pal = pal,
      values = values,
      title = title,
      opacity = 1
    )
}

# One map per measure; every map uses the 0.0-0.5 slice of the magma palette.
surf_data2 %>%
  make_leaflet_magma(
    value_col = "Years_Surfing",
    palette_range = c(0.0, 0.5),
    title = "Surfing Experience (years)"
  )

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

surf_data2 %>%
  make_leaflet_magma(
    value_col = "Experience",
    palette_range = c(0.0, 0.5),
    title = "Shaping Experience (years)"
  )

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

# The log column is mapped here; AvgBoardCost is the non-log version.
surf_data2 %>%
  make_leaflet_magma(
    value_col = "log_AvgBoardCost",
    palette_range = c(0.0, 0.5),
    title = "Log Avg Board Cost ($)"
  )

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

surf_data2 %>%
  make_leaflet_magma(
    value_col = "log_NumBoardsTotal",
    palette_range = c(0.0, 0.5),
    title = "Log Num Boards Total"
  )

## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with either
## missing or invalid lat/lon values and will be ignored

library(corrplot)

## corrplot 0.92 loaded

# Force every column to numeric (via character, so factor labels rather than
# factor codes are parsed); values that are not numbers silently become NA.
numeric_surf_data2 <- as.data.frame(lapply(surf_data2, function(column) {
  suppressWarnings(as.numeric(as.character(column)))
}))

# Keep only columns with at least one non-missing value, since all-NA
# columns cannot contribute to the correlation plot.
numeric_surf_data2 <- numeric_surf_data2[
  , vapply(numeric_surf_data2, function(column) !all(is.na(column)), logical(1))
]
# NOTE(review): positional slice; which columns land in 5:16 depends on the
# column order after the all-NA columns are dropped. Confirm against the data.
numeric_surf_data2 <- numeric_surf_data2[, 5:16]

# Pairwise-complete correlations, drawn as the upper triangle of a colour grid
cor_matrix <- cor(x = numeric_surf_data2, use = "pairwise.complete.obs")
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  tl.cex = 0.7,
  number.cex = 0.5
)

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(wordcloud2)

# Word cloud of the free-text responses in the Surfing.Achievement column.
# NOTE(review): the previous comment here read "Q4 is Example of a Specific
# Scientific Advancement in Board Design" -- confirm which survey question
# Surfing.Achievement actually holds.
text_column <- surf_data2$Surfing.Achievement

# Collapse every response into one space-separated string, so the corpus
# below contains a single document
text_combined <- paste(text_column, collapse = " ")
corpus <- Corpus(VectorSource(text_combined))

# Clean the text step by step: lower-case, strip punctuation, strip digits,
# then remove English stopwords
corpus <- tm_map(corpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents

corpus <- tm_map(corpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents

corpus <- tm_map(corpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents

corpus <- tm_map(corpus, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents

# Extra, hand-picked words to exclude from the cloud
# NOTE(review): "example" may be a leftover from a different question -- confirm
corpus <- tm_map(corpus, removeWords, c(
  "example"
))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("example")):
## transformation drops documents

# Term-by-document counts as a plain matrix (one row per term).
# Note: the variable name `matrix` is the same as the base R function matrix().
tdm <- TermDocumentMatrix(corpus)
matrix <- as.matrix(tdm)

# Total count per term, most frequent first
word_freqs <- sort(rowSums(matrix), decreasing = TRUE)
# word_freqs

# Draw with wordcloud2: one viridis colour per word, taken from the 0.3-1
# portion of the palette
wordcloud_df <- data.frame(word = names(word_freqs), freq = word_freqs)
wordcloud2(wordcloud_df, size = 0.8, color = viridis(nrow(wordcloud_df), begin = 0.3, end = 1), shape = 'circle')

#  color options: https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/

library(tidytext)
library(widyr)
library(tidyr)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggraph)


# Tokenise the achievement responses: each (Response.ID, sentence) row becomes
# one (Response.ID, word) row per word, then stop words are discarded.
tokens <- surf_data2 %>%
  select(Response.ID, Surfing.Achievement) %>%
  unnest_tokens(output = word, input = Surfing.Achievement) %>%
  anti_join(stop_words, by = "word")

# Co-occurrence: how often two words appear under the SAME Response.ID
word_pairs <- pairwise_count(
  tokens,
  item = word,
  feature = Response.ID,
  sort = TRUE,
  upper = FALSE
)

# No minimum-count filter is applied at the moment; the candidate filter is
# kept below for reference.
strong_pairs <- word_pairs
# %>% filter(n >= 2)

# Turn the pair table into an igraph graph (one edge per word pair)
graph <- graph_from_data_frame(strong_pairs)

# Draw the network with the "fr" layout; edge width tracks the pair count n
ggraph(graph, layout = "fr") +
  geom_edge_link(aes(width = n), alpha = 0.95, color = "gray") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 4) +
  labs(title = "Greatest Achievement Co-Occurrence") +
  theme_void()

## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 96 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

This network isn’t helpful – the free-text responses vary too much for clear co-occurrence patterns to emerge.

# Boxplots of Experience (years shaping), one box per Role_Single level,
# filled from a reversed 0-0.6 slice of the magma palette.
ggplot(
  data = surf_data2,
  mapping = aes(x = Role_Single, y = Experience, fill = Role_Single)
) +
  geom_boxplot(colour = "gray25", width = 0.6, outlier.shape = 21) +
  scale_fill_viridis_d(option = "magma", begin = 0, end = 0.6, direction = -1) +
  ggtitle("Years Shaping by Industry Role", subtitle = "Boxplots") +
  xlab(NULL) +
  ylab("Experience Shaping (Years)") +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# TODO: Note, I hand-coded a lot of these roles to the best of my ability – we should discuss optimal hand-coding practices (e.g., Shaper takes precedence, etc.)

# Boxplots of Years_Surfing, one box per Role_Single level, filled from a
# reversed 0-0.6 slice of the magma palette.
ggplot(
  data = surf_data2,
  mapping = aes(x = Role_Single, y = Years_Surfing, fill = Role_Single)
) +
  geom_boxplot(colour = "gray25", width = 0.6, outlier.shape = 21) +
  scale_fill_viridis_d(option = "magma", begin = 0, end = 0.6, direction = -1) +
  ggtitle("Years Surfing by Industry Role", subtitle = "Boxplots") +
  xlab(NULL) +
  ylab("Experience Surfing (Years)") +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Single boxplot of the raw NumBoardsTotal values, filled with one magma colour
ggplot(data = surf_data2, mapping = aes(y = NumBoardsTotal)) +
  geom_boxplot(fill = viridisLite::magma(1, begin = 0.4, end = 1)) +
  ggtitle("Distribution of Total Boards Shaped") +
  xlab(NULL) +
  ylab("Total Boards") +
  theme_minimal(base_size = 11) +
  theme(
    axis.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Single boxplot of the log-transformed total board counts.
# The count is shifted by 1 before taking the log, matching how the
# log_NumBoardsTotal column is built. A plain log() maps a count of 0 to
# -Inf, and ggplot2 drops non-finite values, so zero counts would otherwise
# be left out of the plot.
ggplot(surf_data2, aes(y = log(NumBoardsTotal + 1))) +
  geom_boxplot(fill = viridisLite::magma(1, begin = 0.4, end = 1)) +
  labs(
    title = "Log Distribution of Total Boards Shaped",
    y = "Total Boards (Log(x + 1) Transformed)",
    x = NULL
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Single boxplot of the NumBoardsYearly values, filled with one magma colour
ggplot(data = surf_data2, mapping = aes(y = NumBoardsYearly)) +
  geom_boxplot(fill = viridisLite::magma(1, begin = 0.4, end = 1)) +
  ggtitle("Distribution of Num Boards Yearly") +
  xlab(NULL) +
  ylab("Num Boards Yearly") +
  theme_minimal(base_size = 11) +
  theme(
    axis.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Surf Data Wave 2

Colden Johnson

2025-10-05