library(tidyr)
library(dplyr)
# Load the overview table and derive three helper columns:
#   term - the part of `term_time` before the "_" (e.g. "f18")
#   time - the part of `term_time` after the "_" (e.g. "pr" / "po")
#   year - a label grouping terms together (presumably the academic year,
#          e.g. f18 + w19 + s19 -> "18-19"; confirm with the data owner)
# `term_time` itself is kept: a throwaway copy (`term_time2`) is what gets
# split, because `separate()` removes its input column by default.
#
# NOTE(review): the data also contains f21 terms (f21_pr / f21_po). They are
# not listed in the mapping below, so their `year` is NA. Confirm whether a
# "21-22" label is intended.
df <- read.csv(file = "new_data/overview.csv", header = TRUE)
df <- df %>%
  mutate(term_time2 = term_time) %>%
  separate(term_time2, into = c("term", "time"), sep = "_") %>%
  mutate(year = case_when(
    term %in% c("f18", "w19", "s19") ~ "18-19",
    term %in% c("f19", "w20", "s20") ~ "19-20",
    term %in% c("f20", "w21", "s21") ~ "20-21",
    # Any term not listed above gets a missing year.
    TRUE ~ NA_character_
  ))
Unique IDs: 2560
Min Count: 1
Max Count: 10
Avg Count: 1.98
# Row count per id: one row per distinct id (Var1) with the number of rows
# in `df` carrying that id (Freq).
ids <- as.data.frame(table(df[["id"]]))
# Summary figures for the per-id counts, kept for reference:
# nrow(ids)
# min(ids$Freq)
# max(ids$Freq)
# mean(ids$Freq)

# Distribution of the per-id counts: Var1 is a row count, Freq is the number
# of ids that occur exactly that many times.
ids_counts <- as.data.frame(table(ids[["Freq"]]))
ids_counts
## Var1 Freq
## 1 1 972
## 2 2 1096
## 3 3 222
## 4 4 183
## 5 5 40
## 6 6 35
## 7 7 8
## 8 8 3
## 9 10 1
# Number of distinct ids observed in each `term_time` value
# (one output row per term_time, column `unique_ids`).
unique_id_counts <- summarise(
  group_by(df, term_time),
  unique_ids = n_distinct(id)
)
unique_id_counts
## # A tibble: 20 x 2
## term_time unique_ids
## <chr> <int>
## 1 f18_po 282
## 2 f18_pr 317
## 3 f19_po 360
## 4 f19_pr 409
## 5 f20_po 199
## 6 f20_pr 221
## 7 f21_po 111
## 8 f21_pr 233
## 9 s19_po 166
## 10 s19_pr 209
## 11 s20_po 216
## 12 s20_pr 276
## 13 s21_po 180
## 14 s21_pr 321
## 15 w19_po 255
## 16 w19_pr 318
## 17 w20_po 235
## 18 w20_pr 323
## 19 w21_po 202
## 20 w21_pr 229
# For every (term, id, cid) combination, count how many rows are tagged
# "pr" and how many are tagged "po".
result <- df %>%
  group_by(term, id, cid) %>%
  summarise(
    pr_count = sum(time == "pr"),
    po_count = sum(time == "po"),
    # Drop the grouping explicitly. By default summarise() only peels off the
    # last grouping column, leaving the result silently grouped by
    # (term, id) and printing a regrouping message.
    .groups = "drop"
  )
# Keep only the (term, id, cid) combinations that have at least one "pr" row
# and at least one "po" row. The unit is id within cid within term, not id
# alone.
result_filtered <- result %>%
  filter(pr_count > 0 & po_count > 0)
# Number of such matched combinations per term.
matched <- data.frame(table(result_filtered$term))
matched
## Var1 Freq
## 1 f18 239
## 2 f19 292
## 3 f20 131
## 4 f21 102
## 5 s19 123
## 6 s20 138
## 7 s21 135
## 8 w19 228
## 9 w20 175
## 10 w21 138
# Number of rows each id contributes within each `year` label.
year_matched <- df %>%
  group_by(id, year) %>%
  # Drop the grouping explicitly so the result is not left grouped by `id`.
  summarise(count = n(), .groups = "drop")
# Cross-tabulate year (Var1) against rows-per-id (Var2); Freq is the number
# of ids with that many rows in that year.
# Rows whose `year` is NA (terms not covered by the case_when() mapping,
# e.g. f21) form their own (id, NA) groups above but do not appear here,
# because table() excludes NA by default.
year_matched2 <- data.frame(table(year_matched$year, year_matched$count))
year_matched2
## Var1 Var2 Freq
## 1 18-19 1 280
## 2 19-20 1 405
## 3 20-21 1 417
## 4 18-19 2 445
## 5 19-20 2 428
## 6 20-21 2 353
## 7 18-19 3 44
## 8 19-20 3 75
## 9 20-21 3 50
## 10 18-19 4 50
## 11 19-20 4 55
## 12 20-21 4 16
## 13 18-19 5 3
## 14 19-20 5 13
## 15 20-21 5 3
## 16 18-19 6 5
## 17 19-20 6 8
## 18 20-21 6 0
# Number of distinct ids that appear under each `cid`
# (one output row per cid, column `appearances`).
cid_count <- summarise(
  group_by(df, cid),
  appearances = n_distinct(id)
)
cid_count
## # A tibble: 9 x 2
## cid appearances
## <chr> <int>
## 1 F001A 81
## 2 F008. 1
## 3 F010. 583
## 4 F014. 1
## 5 F040A 890
## 6 F040B 555
## 7 F040C 426
## 8 F041. 751
## 9 F058. 2
# Load new_data/df_long.csv and list its column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
df2 <- read.csv(file = "new_data/df_long.csv", header = TRUE)
colnames(df2)
## [1] "AnonymousID" "FeltLikeSciencePerson"
## [3] "SeeMyselfSciencePerson" "FamilySeeSciencePerson"
## [5] "InstructorSeeSciencePerson" "PeerSeeSciencePerson"
## [7] "EnjoyScience" "InterestedScience"
## [9] "UnderstandPreviousScience" "UnderstandNewScience"
## [11] "OvercomeSetbacks" "ConfidentOutsideClass"
## [13] "ConfidentExams" "OthersAskHelp"
## [15] "OutsideClassInSubject1" "OutsideClassInSubject2"
## [17] "RealWorldIssues" "FindArticles"
## [19] "CriticallyRead" "IdentifyPatterns"
## [21] "RecognizeArgument" "DevelopArgument"
## [23] "WriteDocuments" "WorkWithOthers"
## [25] "OralPresentation" "Enthusiastic"
## [27] "DiscussWithFriends" "PlanningAdditionalClasses"
## [29] "PursuringCareer" "UnderstandSubject"
## [31] "SucceedSubject" "ComplexIdeas"
## [33] "AskingForHelp" "ConnectIdeas"
## [35] "ApplyingOutsideClass" "SystematicReasoning"
## [37] "AnalyzingData" "ScienceCareer"
## [39] "Career.Goal" "OtherCareer"
## [41] "PreviousCourses" "CurrentlyEmployed"
## [43] "CurrentJobTitle" "Ethnicity"
## [45] "UnlistedEthnicity" "ArmedForces"
## [47] "Time" "Date"
## [49] "Duration" "Gender"
## [51] "BIOL_Crse" "BIOL_Grade"
## [53] "Instructor" "term"