surf_data <- read.csv("surf_data.csv")
library("wesanderson")
class(surf_data$LocationLatitude)
## [1] "character"
surf_data$LocationLatitude <- as.numeric(surf_data$LocationLatitude)
## Warning: NAs introduced by coercion
surf_data$LocationLongitude <- as.numeric(surf_data$LocationLongitude)
## Warning: NAs introduced by coercion
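Quick sanity check (a sketch) on how many rows failed the coercion, since those same rows will be dropped from the maps below:

# Sketch: which rows have unusable coordinates after coercion?
bad_coords <- which(is.na(surf_data$LocationLatitude) |
                      is.na(surf_data$LocationLongitude))
length(bad_coords)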
library(leaflet)
leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = surf_data$LocationLongitude, lat = surf_data$LocationLatitude)
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored
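To avoid leaflet silently dropping those rows, one option (a sketch) is to filter out the missing coordinates before mapping:

# Sketch: keep only rows with usable coordinates before plotting.
surf_map_data <- surf_data[!is.na(surf_data$LocationLatitude) &
                             !is.na(surf_data$LocationLongitude), ]
leaflet(data = surf_map_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~LocationLongitude, lat = ~LocationLatitude)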
surf_data$Experience <- as.factor(surf_data$Experience)
pal <- colorFactor("magma", levels = surf_data$Experience)
## Warning in colorFactor("magma", levels = surf_data$Experience): Duplicate
## levels detected
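The duplicate-levels warning comes from passing the whole Experience column as `levels`; a small sketch of the fix is to pass it as the domain instead (the resulting palette is the same):

# Sketch: pass the column as the domain (or use levels(surf_data$Experience))
# so colorFactor sees each level only once.
pal <- colorFactor("magma", domain = surf_data$Experience)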
library(leaflet)
leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = surf_data$LocationLongitude, lat = surf_data$LocationLatitude, color = ~pal(Experience)) %>%
  addLegend(data = surf_data,
            pal = pal,
            values = ~Experience,
            title = "Experience Level",
            position = "bottomright")
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored
surf_data$Q2_1 <- as.numeric(surf_data$Q2_1)
## Warning: NAs introduced by coercion
pal <- colorNumeric("magma", domain = surf_data$Q2_1)
library(leaflet)
leaflet(data = surf_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = surf_data$LocationLongitude, lat = surf_data$LocationLatitude, color = ~pal(Q2_1)) %>%
  addLegend(data = surf_data,
            pal = pal,
            values = ~Q2_1,
            title = "Q2_1",  # legend shows Q2_1 now, not Experience
            position = "bottomright")
## Warning in validateCoords(lng, lat, funcName): Data contains 6 rows with either
## missing or invalid lat/lon values and will be ignored
#colnames(surf_data) <- as.character(surf_data[1, ])
#surf_data <- surf_data[-1, ]
# This is a very janky fix, but I'll do it for now...
library(corrplot)
## corrplot 0.92 loaded
# coerce all cols to be numeric
numeric_surf_data <- as.data.frame(lapply(surf_data, function(x) {
  suppressWarnings(as.numeric(as.character(x)))
}))
# remove all NA cols (don't include in corrplot)
numeric_surf_data <- numeric_surf_data[, colSums(!is.na(numeric_surf_data)) > 0]
# keep only columns 5 through 16 for the correlation plot
numeric_surf_data <- numeric_surf_data[, 5:16]
cor_matrix <- cor(numeric_surf_data, use = "pairwise.complete.obs")
## Warning in cor(numeric_surf_data, use = "pairwise.complete.obs"): the standard
## deviation is zero
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7, number.cex = 0.5)
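The "standard deviation is zero" warning usually means at least one of the selected columns is constant; a minimal sketch for dropping zero-variance columns before recomputing the matrix:

# Sketch: drop constant (zero-variance) columns so cor() doesn't
# emit the sd-is-zero warning or produce NA rows.
col_sds <- apply(numeric_surf_data, 2, sd, na.rm = TRUE)
numeric_surf_data <- numeric_surf_data[, !is.na(col_sds) & col_sds > 0]
cor_matrix <- cor(numeric_surf_data, use = "pairwise.complete.obs")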
Quick analysis:

- The surfboard-science understanding questions correlate with one another.
- Satisfaction with surfboard design (Q1_1) correlates slightly with perceived understanding of the science, and strongly with Q2_1 (what's that?).
Basically, I need to rename all of the column titles, but that's going to be annoying, so I'll do it later (rough sketch below).
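When I do get around to it, one low-effort approach (a sketch; the new name is a placeholder taken from the note above) is dplyr::rename() with readable names:

# Sketch: rename columns with readable names via dplyr::rename().
# Only Q1_1 is renamed here (satisfaction with design, per the note above);
# the rest would follow the same pattern once the questions are mapped out.
library(dplyr)
surf_data <- surf_data %>%
  rename(design_satisfaction = Q1_1)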
Next steps:

- decide which ones to include / not include
- throw out correlations that aren't significant (see the sketch below)
- rank rows or columns by p-value, R-squared, etc.
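For the significance filtering, corrplot can blank out insignificant cells; a sketch, assuming the zero-variance columns have been dropped first:

# Sketch: blank out correlations whose p-value exceeds 0.05.
p_res <- cor.mtest(numeric_surf_data, conf.level = 0.95)
corrplot(cor_matrix, method = "color", type = "upper",
         p.mat = p_res$p, sig.level = 0.05, insig = "blank",
         tl.cex = 0.7, number.cex = 0.5)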
library(tm)
## Loading required package: NLP
library(wordcloud2)
# Q4 is Example of a Specific Scientific Advancement in Board Design
text_column <- surf_data$Q4
# From column, combine into one mega string
text_combined <- paste(text_column, collapse = " ")
corpus <- Corpus(VectorSource(text_combined))
# do cleaning, remove stopwords
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, c(
"example"
))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("example")):
## transformation drops documents
tdm <- TermDocumentMatrix(corpus)
tdm_matrix <- as.matrix(tdm)  # avoid shadowing base::matrix
word_freqs <- sort(rowSums(tdm_matrix), decreasing = TRUE)
# word_freqs
# use wordcloud2
science_df <- data.frame(word = names(word_freqs), freq = word_freqs)
wordcloud2(science_df, size = 0.8)
Word clouds are bad (truism). Possible alternatives: a co-occurrence network, word trees.
A useful website to steal from: https://slcladal.netlify.app/net.html. Essentially, this should be more quantitative than a simple word cloud.
Co-occurrence, topic modeling (LDA); see the LDA sketch below.
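For the LDA direction, a minimal sketch, assuming the topicmodels package is available (the per-response tokenization mirrors the co-occurrence code further down, and k = 2 is an arbitrary choice):

# Sketch: 2-topic LDA on per-response Q4 text, then top terms per topic.
library(topicmodels)
library(tidytext)
library(dplyr)
q4_dtm <- surf_data %>%
  select(ResponseId, Q4) %>%
  unnest_tokens(word, Q4) %>%
  anti_join(stop_words, by = "word") %>%
  count(ResponseId, word) %>%
  cast_dtm(ResponseId, word, n)
q4_lda <- LDA(q4_dtm, k = 2, control = list(seed = 42))
tidy(q4_lda, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = 10)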
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(widyr)
library(tidyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggplot2)
# TODO: the question text row is also encoded in here, which is silly (need to just rename those darn columns...). It doesn't matter much, though, because it gets filtered out by the n >= 2 threshold.
# grab literally every word (except in stopwords)
tokens <- surf_data %>%
  select(ResponseId, Q4) %>%
  unnest_tokens(word, Q4) %>%  # (ResponseId, sentence) -> one row per (ResponseId, word)
  filter(!word %in% stop_words$word)
# co-occurrence (number of times words appear together with the SAME response id)
word_pairs <- tokens %>%
  pairwise_count(word, ResponseId, sort = TRUE, upper = FALSE)
# filter (otherwise it's a pretty stupid-looking graph)
strong_pairs <- word_pairs %>%
  filter(n >= 2)
# I haven't done this before, but sure
graph <- strong_pairs %>%
  graph_from_data_frame()
# Plot the network
ggraph(graph, layout = "fr") +
  geom_edge_link(aes(width = n), alpha = 0.95, color = "gray") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 4) +
  theme_void() +
  ggtitle("Specific Scientific Advancements in Board Design (Q4) Co-Occurrence")
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
This graph is simultaneously very cool and also a little silly. It would make a bit more sense if we had many more responses to the survey. As is, the primary issue is that we’re looking at n>=2, which just isn’t that high (and if we move that to n>=3, it’s not dense enough to be meaningful).
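A quick way to see how fast the graph thins out as the threshold rises (a small sketch):

# Sketch: number of word pairs surviving each minimum co-occurrence count.
thresholds <- 2:5
setNames(sapply(thresholds, function(k) sum(word_pairs$n >= k)), thresholds)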
Questions for second wave:

- Demographic data:
  - age (numeric)
  - years of experience in the industry (why is this on a scale of 1, 2, 3????)
  - years of experience surfing
  - gender
- Have you ever used simulation software (CAD?) in shaping or design?
- Along this vein, some less "subjective" questions about scientific understanding in board design.
- What stops you from experimenting more with design? (order a list: cost, data, etc.)
- How many boards do you shape/sell a year? (Not sure if this is the exact right question, but something numeric along these lines to get a sense of scale.)
- Rank these variables in terms of design impact: whatever the things were that we asked about earlier.
Palette reference: https://github.com/karthik/wesanderson (Life Aquatic).