SUMMARY: During this self directed study I have learned a few things baout plotly, that I had no idea existed before, and was able to improve on a plot I had previously generated with ggplot. Here is a list of stuff I have learned:
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(knitr)
The below plots are me exploring my data before doing the independent study of plotly
APADATA <- read_csv("APADATA.csv")
## Rows: 83 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): ICSR.Avg, DQ_7_GPA, GQ_19_SubSuccess, PSS.Avg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(APADATA, 10)
## # A tibble: 10 × 4
## ICSR.Avg DQ_7_GPA GQ_19_SubSuccess PSS.Avg
## <dbl> <dbl> <dbl> <dbl>
## 1 4.24 3.65 6 26
## 2 3.76 3.43 3 18
## 3 3.53 3.8 5 18
## 4 4.47 3.2 4 31
## 5 3.18 3.97 4 19
## 6 4.24 3.3 7 16
## 7 4.24 2.76 3 17
## 8 3.71 3.55 5 30
## 9 4.29 3.95 6 10
## 10 3.12 2.8 3 24
#Renaming scale data to more easily understandable names: resilience, subjective success, GPA, and perceived stress
APArenamed <- APADATA |>
rename(Resilience = ICSR.Avg, GPA = DQ_7_GPA, Subjective_Success = GQ_19_SubSuccess, Stress = PSS.Avg)
#Checking variable names are correct
print(colnames(APArenamed))
## [1] "Resilience" "GPA" "Subjective_Success"
## [4] "Stress"
#Filter out NA values from the GPA column and -888 values from GPA AND Subjective Success Columns
APAfilter <- APArenamed |>
na.omit()|>
filter(!(GPA %in% c(-888)))|>
filter(!(Subjective_Success %in% c(-888)))
head(APAfilter,15)
## # A tibble: 15 × 4
## Resilience GPA Subjective_Success Stress
## <dbl> <dbl> <dbl> <dbl>
## 1 4.24 3.65 6 26
## 2 3.76 3.43 3 18
## 3 3.53 3.8 5 18
## 4 4.47 3.2 4 31
## 5 3.18 3.97 4 19
## 6 4.24 3.3 7 16
## 7 4.24 2.76 3 17
## 8 3.71 3.55 5 30
## 9 4.29 3.95 6 10
## 10 3.12 2.8 3 24
## 11 3.88 3.57 5 24
## 12 3.76 3.45 5 19
## 13 3.65 3.95 5 24
## 14 3.18 3 4 20
## 15 4.06 3.8 4 16
We will only be creating graphs for the multiple regression model for the impact of resilience and perceived stress on subjective success since there was no significant relationship in the model with objective success.
#creates scatter plots to compare Resilience to Subjective success
ggplot(data = APAfilter,
mapping = aes(x = Resilience, y = Subjective_Success))+
geom_point()+
geom_smooth(method = "lm",
col = "lightblue",
se = FALSE)+
labs(title = "Comparing Resilience and Subjective Success",
x = "Resilience",
y = "Subjective Success")
## `geom_smooth()` using formula = 'y ~ x'
#creates scatter plots to compare Perceived stress to Subjective success
ggplot(data = APAfilter,
mapping = aes(x = Stress, y = Subjective_Success))+
geom_point()+
geom_smooth(method = "lm",
col = "pink",
se=FALSE)+
labs(title = "Comparing Perceived and Subjective Success",
x = "Stress",
y = "Subjective Success")
## `geom_smooth()` using formula = 'y ~ x'
#creates scatter plots to compare Resilience to Stress
ggplot(data = APAfilter,
mapping = aes(x = Resilience, y = Stress))+
geom_point()+
geom_smooth(method = "lm",
col = "lightgreen",
se = FALSE)+
labs(title = "Comparing Resilience and Subjective Success",
x = "Resilience",
y = "Stress")
## `geom_smooth()` using formula = 'y ~ x'
#Scatter plot showing the effect of both Perceived stress and Resilience on Subjective Success
#Stress and resilience are inversely related, as shown by the graph above, so this graph *SHOULD* show that the people with the lowest stress, will have the most resilience, and therefore score higher on Subjective Success I have also changed the theme to dark because the yellow from the plasma color set does not look good on a white background
ggplot(data = APAfilter,
mapping = aes(x = Stress, y = Subjective_Success, color = Resilience))+
geom_point()+
stat_smooth(method = "lm",
se = FALSE,
color = "lightgrey")+
scale_color_viridis_c(option = "plasma")+
labs( title = "High Resilience & Low Stress Are Related to Greater Subjective Success",
x = "Stress",
y = "Subjective Success")+
theme_dark()
## `geom_smooth()` using formula = 'y ~ x'
#EXAMPLE FROM PLOTLY. This is an example of some code provided by plotly for how to create various heatmap plots. I have commented to show my understanding of each:
fig1.1 <- plot_ly(z = volcano, type = "heatmap")
fig1.1
m <- matrix(rnorm(9), nrow = 3, ncol = 3) #allows you to choose the total number of boxes on your heatmap, making it as detailed or undetailed as you like. Adding more columns and rows will increase that level of detail by giving the colors more space to map to.
fig1.2 <- plot_ly(
x = c("a", "b", "c"), y = c("d", "e", "f"),
z = m, type = "heatmap"
)
fig1.2
fig1.3 <- plot_ly(z = volcano, colors = "Greys", type = "heatmap") #colors will change the scale that the heatmap made with, presumably you would be able to use any of the viridis scale colors, I know I'm partial to the plasma ones since it is very easy to discern.
fig1.3
fig1.4 <- plot_ly(z = volcano, colors = colorRamp(c("red", "green")), type = "heatmap") #this also shows that the color scales can be changed specifically by choosing two different color names that the program will then blend between. However, for this example red and green are poor chocies due to the difficulty with red and green ofr colorblind peoole.
fig1.4
vals <- unique(scales::rescale(c(volcano))) #this changes the dataset to use only the "unique" values from a dataset and filtering out uninteresting or filler data
o <- order(vals, decreasing = FALSE) #data is ordered in descending order
cols <- scales::col_numeric("Blues", domain = NULL)(vals) #will assign each unique value to each color value in the scale pallet
colz <- setNames(data.frame(vals[o], cols[o]), NULL) #removes column names
fig1.5 <- plot_ly(z = volcano, colorscale = colz, type = "heatmap") #creates the heat map with the dataset with the specific color scale specified by colz, cols, and o, and vals.
fig1.5
#creating a heat map to display the data related to Stress, Resilience, and Subjective Success from APA Data
matrix_resilience <- matrix(APAfilter$Resilience, nrow = sqrt(length(APAfilter$Resilience)), byrow = TRUE)
Heatmap1 <- plot_ly(z = matrix_resilience, type = "heatmap") %>%
layout(
title = "High Resilience & Low Stress Are Related to Greater Subjective Success",
xaxis = list(title = "Stress"),
yaxis = list(title = "Subjective Success"))
Heatmap1
#size and colormapping examples from plotly
d <- diamonds[sample(nrow(diamonds), 1000), ]
fig2.1 <- plot_ly(
d, x = ~carat, y = ~price,
color = ~carat, size = ~carat #in this graph size, color, and the x axis are all showing the same variable, however this could be really beneficial for showing more than 2 variables on a scatter plot, so larger dots and brighter colored dots can show the strength of a 3rd or 4th variable
)
fig2.1
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.
fig2.2 <- plot_ly(
d, x = ~carat, y = ~price,
# Hover text: this makes it easier to understand what each individual datapoint means rather than just having an arbitrary scale number, this is helpful for this kind of data since each point will easiky correspond to each diamond- also price and carat are easily understood by most people, unlike my data where people have no idea what it means to score a 24 on the perceived stress scale.
text = ~paste("Price: ", price, '$<br>Cut:', cut), #descriptive labels aid in accessibility an understandability of a graph to an uninformed audience
color = ~carat, size = ~carat
)
fig2.2
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.
#Color and size mapping in plotly WITH APA DATA. Should generate a plot that looks similar to the scatterplot of high stress and low resilience relating to subjective success BUT resilience will be demonstrated by both color AND point Size!
size_color <- plot_ly(
APAfilter,x = ~Stress,y =~Subjective_Success,color = ~Resilience, size = ~Resilience)|>
layout(
title = "High Resilience & Low Stress Are Related to Greater Subjective Success",
xaxis = list(title = "Stress"),
yaxis = list(title = "Subjective Success"))
size_color
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.
#multiple linear regression modeling ideas from plotly- i will not be doing a 3d plot for the final project with my APA data I just want to play with it with a solid example :)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.8 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.1 ✔ yardstick 1.3.2
## ✔ recipes 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(kernlab) #helps fit models to predict the outcomes of continuous input variables
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:dials':
##
## buffer
##
## The following object is masked from 'package:scales':
##
## alpha
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(pracma) #For creating a meshgrid that helps build the "regression blanket (i dont know what the proper term for that is... but it looks like a blanket)
##
## Attaching package: 'pracma'
##
## The following objects are masked from 'package:kernlab':
##
## cross, eig, size
##
## The following object is masked from 'package:purrr':
##
## cross
data(iris)
mesh_size <- .02
margin <- 0
X <- iris |> select(Sepal.Width, Sepal.Length)
y <- iris |> select(Petal.Width)
model <- svm_rbf(cost = 1.0) |>
set_engine("kernlab")|>
set_mode("regression")|>
fit(Petal.Width ~ Sepal.Width + Sepal.Length, data = iris) #allows the model to use the Sepal width and length to predict the petal width
x_min <- min(X$Sepal.Width) - margin
x_max <- max(X$Sepal.Width) - margin
y_min <- min(X$Sepal.Length) - margin
y_max <- max(X$Sepal.Length) - margin
xrange <- seq(x_min, x_max, mesh_size)
yrange <- seq(y_min, y_max, mesh_size) #helps define the size of the grid we will use to make predictions, this is set as a range because we are looking at continuous values
xy <- meshgrid(x = xrange, y = yrange)
xx <- xy$X
yy <- xy$Y
dim_val <- dim(xx)
xx1 <- matrix(xx, length(xx), 1)
yy1 <- matrix(yy, length(yy), 1)
final <- cbind(xx1, yy1)
pred <- model |>
predict(final) #will allow us to predict the pedal width at every gridpoint, regardless of if we have data there or not since it is being fit to the matrix made up earlier by our predictors (sepal width and length)
pred <- pred$.pred
pred <- matrix(pred, dim_val[1], dim_val[2])
fig3.1 <- plot_ly(iris, x = ~Sepal.Width, y = ~Sepal.Length, z = ~Petal.Width ) |>
add_markers(size = 5) |>
add_surface(x=xrange, y=yrange, z=pred, alpha = 0.65, type = 'mesh3d', name = 'pred_surface') #will smooth out how the blanket looks rather than it being gridlike
fig3.1
#3d plot with APA data, again I wont be using this at all in the final, I just wanted to see what it would look like as a 3d scatter plot
plot_ly(APAfilter, x = ~Resilience, y = ~Stress, z = ~Subjective_Success,
type = "scatter3d",
mode = "markers",
marker = list(size=3,
color = ~Resilience,
colorscale = "Plasma")) %>%
layout(scene = list(
xaxis = list(title = "Resilience"),
yaxis = list(title = "Stress"),
zaxis = list(title = "Subjective Success")
))
#I tried for like 3 hours to get the prediction blanket on this plot but for my sanity and safety of my computer I will just leave this as a 3D scatter plot