library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/ethandunn/Documents/University/2025/Semester 2/Psychology Capstone/Assignements/Practice data analysis/w2day2 copy
# Resolve the survey export relative to the project root, then read it in.
cc_loc <- here("CCdata.csv")
cc_data <- read_csv(cc_loc)
## New names:
## Rows: 8 Columns: 66
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (66): StartDate, EndDate, Status, IPAddress, Progress, Duration (in seco...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...18`
# Inspect the raw import. Every column was parsed as character, and the first
# two rows hold question text and ImportId metadata rather than responses.
print(cc_data)
## # A tibble: 8 × 66
##   StartDate    EndDate Status IPAddress Progress Duration (in seconds…¹ Finished
##   <chr>        <chr>   <chr>  <chr>     <chr>    <chr>                  <chr>   
## 1 "Start Date" "End D… "Resp… "IP Addr… "Progre… "Duration (in seconds… "Finish…
## 2 "{\"ImportI… "{\"Im… "{\"I… "{\"Impo… "{\"Imp… "{\"ImportId\":\"dura… "{\"Imp…
## 3 "2025-09-01… "2025-… "IP A… "27.32.7… "100"    "693"                  "True"  
## 4 "2025-09-01… "2025-… "IP A… "49.193.… "100"    "755"                  "True"  
## 5 "2025-09-01… "2025-… "IP A… "172.225… "100"    "1265"                 "True"  
## 6 "2025-09-01… "2025-… "IP A… "128.250… "100"    "1730"                 "True"  
## 7 "2025-09-01… "2025-… "IP A… "194.127… "100"    "841"                  "True"  
## 8 "2025-09-01… "2025-… "IP A… "106.70.… "100"    "432"                  "True"  
## # ℹ abbreviated name: ¹​`Duration (in seconds)`
## # ℹ 59 more variables: RecordedDate <chr>, ResponseId <chr>,
## #   RecipientLastName <chr>, RecipientFirstName <chr>, RecipientEmail <chr>,
## #   ExternalReference <chr>, LocationLatitude <chr>, LocationLongitude <chr>,
## #   DistributionChannel <chr>, UserLanguage <chr>, ...18 <chr>, Age <chr>,
## #   Gender <chr>, Student <chr>, Q1_1 <chr>, Q2_1 <chr>, Q3_1 <chr>,
## #   Q4_1 <chr>, Q5_1 <chr>, Q6_1 <chr>, Q7_1 <chr>, Q8_1 <chr>, Q9_1 <chr>, …
# Vertical Concept Breadth 

# Look up the positions of the first vertical concept breadth item (Q1_1)
# and of the participant identifier (ResponseId) among the columns of cc_data.
which(colnames(cc_data) == "Q1_1")
## [1] 22
which(colnames(cc_data) == "ResponseId")
## [1] 9
# Select the ten vertical concept breadth items (Q1_1 ... Q10_1) plus the
# participant ID. Columns are chosen by name rather than by position
# (22:31 and 9), so the selection still picks the right columns if the export
# gains, loses or reorders columns; all_of() raises an error if any expected
# item column is missing instead of silently selecting the wrong one.
vertical_cb <- cc_data %>%
  select(all_of(paste0("Q", 1:10, "_1")), ResponseId)
print(vertical_cb)
## # A tibble: 8 × 11
##   Q1_1          Q2_1  Q3_1  Q4_1  Q5_1  Q6_1  Q7_1  Q8_1  Q9_1  Q10_1 ResponseId
##   <chr>         <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>     
## 1 "For each of… "2. … "3. … "4. … "5. … "6. … "7. … "8. … "9. … "10.… "Response…
## 2 "{\"ImportId… "{\"… "{\"… "{\"… "{\"… "{\"… "{\"… "{\"… "{\"… "{\"… "{\"Impor…
## 3 "No"          "No"  "Yes" "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "R_9xJhbq…
## 4 "No"          "No"  "Yes" "Yes" "No"  "No"  "Yes" "Yes" "No"  "No"  "R_9WJs98…
## 5 "No"          "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "No"  "No"  "Yes" "R_9PofNG…
## 6 "No"          "Yes" "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "Yes" "R_42DMxN…
## 7 "Yes"         "No"  "No"  "Yes" "No"  "No"  "Yes" "Yes" "Yes" "Yes" "R_4n15nY…
## 8 "No"          "No"  "Yes" "No"  "No"  "No"  "Yes" "No"  "Yes" "Yes" "R_923cPE…
# Dropping the first two rows, which hold the question text and ImportId
# metadata rather than participant responses.
vertical_clean <- slice(vertical_cb, -(1:2))

print(vertical_clean)
## # A tibble: 6 × 11
##   Q1_1  Q2_1  Q3_1  Q4_1  Q5_1  Q6_1  Q7_1  Q8_1  Q9_1  Q10_1 ResponseId       
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>            
## 1 No    No    Yes   No    No    No    No    Yes   No    No    R_9xJhbqYWbPnFya6
## 2 No    No    Yes   Yes   No    No    Yes   Yes   No    No    R_9WJs98ahDCRZEy0
## 3 No    Yes   Yes   Yes   No    Yes   Yes   No    No    Yes   R_9PofNGNuUJ8ZNM5
## 4 No    Yes   No    No    No    No    Yes   No    No    Yes   R_42DMxN1YupUm0jj
## 5 Yes   No    No    Yes   No    No    Yes   Yes   Yes   Yes   R_4n15nYXDmk4Bz2s
## 6 No    No    Yes   No    No    No    Yes   No    Yes   Yes   R_923cPErlCcqjxL3
# Recode the ten item columns from "Yes"/"No" text to numeric 1/0.
# Items are selected by name (Q1_1 through Q10_1) rather than by position.
# "Yes" becomes 1 and any other non-missing value becomes 0; a missing answer
# stays NA because the comparison with "Yes" propagates NA.
vertical_CB <- vertical_clean %>%
  mutate(across(Q1_1:Q10_1, ~ as.numeric(.x == "Yes")))
print(vertical_CB)
## # A tibble: 6 × 11
##    Q1_1  Q2_1  Q3_1  Q4_1  Q5_1  Q6_1  Q7_1  Q8_1  Q9_1 Q10_1 ResponseId       
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>            
## 1     0     0     1     0     0     0     0     1     0     0 R_9xJhbqYWbPnFya6
## 2     0     0     1     1     0     0     1     1     0     0 R_9WJs98ahDCRZEy0
## 3     0     1     1     1     0     1     1     0     0     1 R_9PofNGNuUJ8ZNM5
## 4     0     1     0     0     0     0     1     0     0     1 R_42DMxN1YupUm0jj
## 5     1     0     0     1     0     0     1     1     1     1 R_4n15nYXDmk4Bz2s
## 6     0     0     1     0     0     0     1     0     1     1 R_923cPErlCcqjxL3
# Calculating each participant's average score across the ten items.
# rowSums() over the item columns is vectorised, replacing the slower
# rowwise() + c_across() pattern, and the items are selected by name.
# unname() drops any row names rowSums() attaches so avg_score is a plain
# numeric column.
# NOTE(review): with na.rm = TRUE and a fixed denominator of 10, an unanswered
# item is scored the same as a "No" (0) -- confirm this is the intended scoring.
vertical_avg <- vertical_CB %>%
  mutate(
    avg_score = unname(rowSums(pick(Q1_1:Q10_1), na.rm = TRUE)) / 10
  )
print(vertical_avg)
## # A tibble: 6 × 12
##    Q1_1  Q2_1  Q3_1  Q4_1  Q5_1  Q6_1  Q7_1  Q8_1  Q9_1 Q10_1 ResponseId       
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>            
## 1     0     0     1     0     0     0     0     1     0     0 R_9xJhbqYWbPnFya6
## 2     0     0     1     1     0     0     1     1     0     0 R_9WJs98ahDCRZEy0
## 3     0     1     1     1     0     1     1     0     0     1 R_9PofNGNuUJ8ZNM5
## 4     0     1     0     0     0     0     1     0     0     1 R_42DMxN1YupUm0jj
## 5     1     0     0     1     0     0     1     1     1     1 R_4n15nYXDmk4Bz2s
## 6     0     0     1     0     0     0     1     0     1     1 R_923cPErlCcqjxL3
## # ℹ 1 more variable: avg_score <dbl>
# Final table: one row per participant with their ID and average score.
# Note that Vertical_avg differs from vertical_avg only in capitalisation.
Vertical_avg <- select(vertical_avg, ResponseId, avg_score)
print(Vertical_avg)
## # A tibble: 6 × 2
##   ResponseId        avg_score
##   <chr>                 <dbl>
## 1 R_9xJhbqYWbPnFya6       0.2
## 2 R_9WJs98ahDCRZEy0       0.4
## 3 R_9PofNGNuUJ8ZNM5       0.6
## 4 R_42DMxN1YupUm0jj       0.3
## 5 R_4n15nYXDmk4Bz2s       0.6
## 6 R_923cPErlCcqjxL3       0.4