Import Data

surf_data <- read.csv("surf_data.csv")
library(wesanderson) # color palettes (Life Aquatic; link at the end)

Map

class(surf_data$LocationLatitude)
## [1] "character"
surf_data$LocationLatitude <- as.numeric(surf_data$LocationLatitude)
## Warning: NAs introduced by coercion
surf_data$LocationLongitude <- as.numeric(surf_data$LocationLongitude)
## Warning: NAs introduced by coercion
library(leaflet)
leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~LocationLongitude, lat = ~LocationLatitude)
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored
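The six flagged rows are presumably responses with no recorded coordinates (e.g. the Qualtrics header/metadata rows). A minimal sketch to avoid the repeated warning, assuming we only need complete coordinates for mapping (surf_mapped is a hypothetical helper frame; the chunks below still use surf_data as-is):

surf_mapped <- surf_data[!is.na(surf_data$LocationLatitude) &
                           !is.na(surf_data$LocationLongitude), ]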

Map Experience as Factor

surf_data$Experience <- as.factor(surf_data$Experience)

pal <- colorFactor("magma", domain = surf_data$Experience) # domain, not levels: passing the full vector as levels caused a duplicate-levels warning
leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~LocationLongitude, lat = ~LocationLatitude,
                   color = ~pal(Experience)) %>%
  addLegend(pal = pal,
            values = ~Experience,
            title = "Experience Level",
            position = "bottomright")
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored

Map Science

surf_data$Q2_1 <- as.numeric(surf_data$Q2_1)
## Warning: NAs introduced by coercion
pal <- colorNumeric("magma", domain = surf_data$Q2_1)


leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~LocationLongitude, lat = ~LocationLatitude,
                   color = ~pal(Q2_1)) %>%
  addLegend(pal = pal,
            values = ~Q2_1,
            title = "Q2_1", # was mislabeled "Experience Level"
            position = "bottomright")
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored

Set First Row as Column Titles

#colnames(surf_data) <- as.character(surf_data[1, ])
#surf_data <- surf_data[-1, ]

# This is a very janky fix, but I'll do it for now...
library(corrplot)
## corrplot 0.92 loaded
# coerce all cols to be numeric
numeric_surf_data <- as.data.frame(lapply(surf_data, function(x) {
  suppressWarnings(as.numeric(as.character(x)))
}))

# remove all NA cols (don't include in corrplot)
numeric_surf_data <- numeric_surf_data[, colSums(!is.na(numeric_surf_data)) > 0]
# keep only the survey-question columns (by position, for now)
numeric_surf_data <- numeric_surf_data[, 5:16]

cor_matrix <- cor(numeric_surf_data, use = "pairwise.complete.obs")
## Warning in cor(numeric_surf_data, use = "pairwise.complete.obs"): the standard
## deviation is zero
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7, number.cex = 0.5)
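The "standard deviation is zero" warning means at least one of the selected columns is constant, so its correlations are undefined. A minimal sketch to drop constant columns before re-running cor(), assuming numeric_surf_data from above:

# drop zero-variance columns so every pairwise correlation is defined
sds <- vapply(numeric_surf_data, sd, numeric(1), na.rm = TRUE)
numeric_surf_data <- numeric_surf_data[, !is.na(sds) & sds > 0]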

Quick analysis:

- the surfboard-science understanding questions correlate with each other
- satisfaction with surfboard design (Q1_1) correlates slightly with perceived understanding of science, and strongly with Q2_1 (what's that? quick spot-check below)
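To put a number on that last relationship, a quick significance spot-check (this assumes Q1_1 and Q2_1 survived the numeric subset above):

cor.test(numeric_surf_data$Q1_1, numeric_surf_data$Q2_1)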

Basically, I need to rename all of the column titles, but that's going to be annoying, so I'll do it later (rough sketch below).
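A minimal sketch of the rename pattern, not applied here since later chunks still use the Q-codes; the readable labels are hypothetical placeholders:

library(dplyr)
surf_data <- surf_data %>%
  rename(
    design_satisfaction   = Q1_1, # hypothetical label
    science_understanding = Q2_1  # hypothetical label
  )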

Next steps:

- decide which variables to include/exclude
- throw out correlations that aren't significant (sketch below)
- rank rows or columns by p-value, R squared, etc.
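For the significance filtering, corrplot ships its own p-value helper; a sketch, assuming the numeric frame from above (sig.level = 0.05 is an arbitrary cutoff):

# blank out cells whose correlation is not significant at the 5% level
p_res <- cor.mtest(numeric_surf_data, conf.level = 0.95)
corrplot(cor_matrix, method = "color", type = "upper",
         p.mat = p_res$p, sig.level = 0.05, insig = "blank",
         tl.cex = 0.7, number.cex = 0.5)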

Example Wordcloud

library(tm)
## Loading required package: NLP
library(wordcloud2)

# Q4 is Example of a Specific Scientific Advancement in Board Design
text_column <- surf_data$Q4

# From column, combine into one mega string
text_combined <- paste(text_column, collapse = " ")
corpus <- Corpus(VectorSource(text_combined))

# do cleaning, remove stopwords
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, c(
  "example"
))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("example")):
## transformation drops documents
tdm <- TermDocumentMatrix(corpus)
matrix <- as.matrix(tdm)

word_freqs <- sort(rowSums(matrix), decreasing = TRUE)
# word_freqs

# use wordcloud2
science_df <- data.frame(word = names(word_freqs), freq = word_freqs)
wordcloud2(science_df, size = 0.8)

Wordclouds are bad (truism) -> co-occurrence networks? word trees?

Useful website to steal from: https://slcladal.netlify.app/net.html. Essentially, this should be more quantitative than a simple word cloud.

Co-occurrence, topic modeling (LDA)
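A sketch of the LDA idea on the Q4 responses, before the co-occurrence network below; this assumes the topicmodels package is installed, and k = 2 is an arbitrary choice given how few responses there are:

library(topicmodels)
library(tidytext)
library(dplyr)

# document-term matrix: one document per response
q4_dtm <- surf_data %>%
  select(ResponseId, Q4) %>%
  unnest_tokens(word, Q4) %>%
  anti_join(stop_words, by = "word") %>%
  count(ResponseId, word) %>%
  cast_dtm(ResponseId, word, n)

q4_lda <- LDA(q4_dtm, k = 2, control = list(seed = 123))

# top 10 words per topic, by per-topic word probability (beta)
tidy(q4_lda, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = 10)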

library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(widyr)
library(tidyr)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

# TODO: the question text itself is also encoded in here, which is silly (need to just rename those darn columns...). It doesn't matter though, because it gets filtered out (n >= 2)


# grab literally every word (except stopwords)
tokens <- surf_data %>%
  select(ResponseId, Q4) %>%
  unnest_tokens(word, Q4) %>% # reshape (ResponseId, sentence) into one row per (ResponseId, word)
  filter(!word %in% stop_words$word)

# co-occurrence (number of times two words appear in the SAME response)
word_pairs <- tokens %>%
  pairwise_count(word, ResponseId, sort = TRUE, upper = FALSE)

# filter (otherwise it's a pretty stupid looking graph)
strong_pairs <- word_pairs %>%
  filter(n >= 2)

# I haven't done this before but sure
graph <- strong_pairs %>%
  graph_from_data_frame()

# Plot the network
ggraph(graph, layout = "fr") +
  geom_edge_link(aes(width = n), alpha = 0.95, color = "gray") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 4) +
  theme_void() +
  ggtitle("Specific Scientific Advancements in Board Design (Q4) Co-Occurrence")
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

This graph is simultaneously very cool and a little silly. It would make more sense if we had many more responses to the survey. As is, the primary issue is that n >= 2 just isn't a high bar (and if we move to n >= 3, the graph isn't dense enough to be meaningful).

Questions for second wave:

Demographic data:

- age (numeric)
- years of experience in the industry (why is this on a scale of 1, 2, 3????)
- years of experience surfing
- gender

Other questions:

- Have you ever used simulation software (CAD?) in shaping or design? Along this vein, some less 'subjective' questions about scientific understanding in board design.
- What stops you from experimenting more with design (order a list)? -> cost, data, etc.
- How many boards do you shape/sell a year? <- not sure if this is the exact right question, but something numeric along these lines to get a sense of scale.
- Rank these variables in terms of design impact: whatever the things were that we asked about earlier.

Palette reference: https://github.com/karthik/wesanderson (Life Aquatic)