analyzing-data-in-science-analysis

Prepping, loading, setting up

library(tidyverse)
library(googlesheets4)

# Both tabs are read from the same Google Sheets workbook, so the URL is
# defined once instead of being pasted into each read_sheet() call.
survey_url <- "https://docs.google.com/spreadsheets/d/1gLyAAp6N_5g-Uo-eb61O-eW2apMzzqNdM8fLcBV7V3g/edit#gid=46458265"

# Survey responses: first tab of the workbook.
d <- read_sheet(survey_url, sheet = 1)

# Qualitative codes for the open-ended Q80 responses.
# Names are cleaned and prefixed with "qual_q80_" so the coded columns can
# later be picked out with contains("qual_q80").
qual_q80 <- read_sheet(survey_url, sheet = "Q 80 - MASTER") %>%
  janitor::clean_names() %>%
  rename_with(~ str_c("qual_q80_", .x))

# The first column is renamed to the join key used below.
names(qual_q80)[1] <- "response_id"

# Join on an explicit key rather than a natural join, so a future column-name
# collision cannot silently change which columns are matched.
d <- d %>%
  left_join(qual_q80, by = "response_id")

RQ 1

How do teachers support their students to analyze and interpret data?

open coding - Q80 overall

qualitatively coded based on open/written responses to Q80

# Number of responses carrying each Q80 qualitative code (val) and that count
# as a share of 328 (prop), sorted from most to least frequent.
# NOTE(review): the denominator 328 is hard-coded; the grade-level counts
# printed further down add up to 330 rows in `d` -- confirm 328 is the
# intended number of Q80 respondents.
overall_freqs <- d %>%
  select(contains("qual_q80")) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position and are not
  # tallied as codes.
  select(-c(1, 9)) %>%
  summarize(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "val") %>%
  mutate(prop = val / 328) %>%
  arrange(desc(val))

# Print each code's share as a percentage rounded to three decimals; the raw
# count column is left out of the table.
overall_freqs %>%
  transmute(key, prop = round(prop * 100, 3)) %>%
  knitr::kable()

key	prop
qual_q80_data_visualization_graph_creation	49.695
qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	39.329
qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	35.061
qual_q80_data_summary_summary_table_averaging_math_tasks	27.439
qual_q80_other_those_that_dont_fit_into_other_categories_modeling	21.341
qual_q80_data_application_answering_question_cer_explanations_comparing	20.732
qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	15.244

# Horizontal bar chart of the code shares, bars ordered by prop.
ggplot(overall_freqs, aes(x = reorder(key, prop), y = prop)) +
  geom_col() +
  coord_flip() +
  theme_minimal()

Fixed-response question – NSSME

4 overlapping

3 maybe overlapping/overlapping a little; 2 not

Q98: How often do you have students do each of the following in your class?

Determine which data would need to be collected in order to answer a scientific question (regardless of who generated the question) (1) – YES
Determine which variables from a provided dataset are necessary to answer a scientific question (13) – MAYBE
Organize and/or represent data using tables, charts, or graphs in order to facilitate analysis (4) – YES
Identify potential sources of variability (e.g., measurement error, natural variation) in the data (5) – MAYBE
Analyze data using grade-appropriate methods in order to identify patterns, trends, or relationships (6) – YES
Consider how missing data or measurement error can affect data interpretation (7) – NO
Select and use grade-appropriate mathematical and/or statistical techniques to analyze data (for example: determining the best measure of central tendency, examining variation in data, or developing a line of best fit) (9) – YES
Use mathematical and/or computational models to generate data to support a scientific claim (10) – NO
Use data and reasoning to defend, verbally or in writing, a claim or refute alternative scientific claims about a real-world phenomenon (regardless of who made the claims) (11) – MAYBE

# For each Q98 frequency item (q98_1 ... q98_9): the percentage of answered
# responses falling in each frequency category, one row per item.
d %>%
  select(q98_1:q98_9) %>%
  pivot_longer(everything(), names_to = "key", values_to = "val") %>%
  # Drop unanswered items. Missing values are tested with is.na(); the literal
  # string "NA" is also excluded because the original filter removed it too.
  filter(!is.na(val), val != "NA") %>%
  count(key, val) %>%
  group_by(key) %>%
  mutate(
    # Share of this item's answered responses in each category
    percentage = n / sum(n) * 100,
    # Sort key: number of "All or almost all science lessons" answers
    order_var = sum(n[val == "All or almost all science lessons"], na.rm = TRUE)
  ) %>%
  ungroup() %>%
  arrange(desc(order_var)) %>%
  mutate(
    # Items ordered from most to fewest "All or almost all" answers
    key = fct_reorder(key, order_var, .desc = TRUE),
    # Fix the category order so the table columns run from most to least frequent
    val = factor(val, levels = c(
      "All or almost all science lessons",
      "Often (once or twice a week)",
      "Sometimes (once or twice a month)",
      "Rarely (a few times a year)",
      "Never"
    ))
  ) %>%
  select(key, val, percentage) %>%
  # names_sort = TRUE orders the new columns by the factor levels set above
  pivot_wider(names_from = val, values_from = percentage, names_sort = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  knitr::kable()

key	All or almost all science lessons	Often (once or twice a week)	Sometimes (once or twice a month)	Rarely (a few times a year)	Never
q98_9	9.756	36.280	39.329	11.890	2.744
q98_5	7.927	42.378	39.939	8.841	0.915
q98_3	7.012	42.378	43.598	6.707	0.305
q98_1	4.573	16.463	44.817	32.317	1.829
q98_6	3.049	16.463	38.415	33.841	8.232
q98_8	3.049	19.512	37.805	25.000	14.634
q98_2	2.744	23.171	46.951	24.085	3.049
q98_4	2.439	24.085	42.073	24.695	6.707
q98_7	2.439	18.293	34.756	32.317	12.195

RQ2 - demographics

How does this support differ based on teachers’ grade, level of training and experience, and the subject they teach?

we use what teachers say they are doing in their own words for RQ2 – so we use Q80, not Q98

Grade Level

# One row per respondent with 0/1 flags for the grade bands they teach:
#   elem   = any grade <= 5
#   second = any grade >= 6
#   middle = any grade from 6 to 8
#   high   = any grade >= 9
# The comma-separated grade list is split into one row per grade, so no fixed
# number of grade columns has to be assumed.
d_grades <- d %>%
  select(response_id, grade_level_taught) %>%
  # Removes the first ":" in the field only (same as the original str_replace).
  # NOTE(review): confirm the field never contains more than one colon.
  mutate(grade_level_taught = str_replace(grade_level_taught, ":", "")) %>%
  separate_rows(grade_level_taught, sep = ",") %>%
  # Entries that are not whole numbers become NA and match no band
  mutate(grade = as.integer(grade_level_taught)) %>%
  group_by(response_id) %>%
  summarize(
    # na.rm = TRUE makes respondents with no usable grade come out as 0
    elem = as.numeric(any(grade <= 5, na.rm = TRUE)),
    second = as.numeric(any(grade >= 6, na.rm = TRUE)),
    middle = as.numeric(any(grade >= 6 & grade <= 8, na.rm = TRUE)),
    high = as.numeric(any(grade >= 9, na.rm = TRUE))
  )

# Attach the grade-band flags to the survey data. The key is named explicitly
# so the join does not depend on which column names the two tables share.
d <- d %>%
  left_join(d_grades, by = "response_id")

# Number of respondents with and without the elementary flag
count(d, elem)

# A tibble: 2 × 2
   elem     n
  <dbl> <int>
1     0   298
2     1    32

# Elementary teachers (elem == 1): number (val) and proportion (prop) of
# teachers whose Q80 response received each code. The denominator is the
# number of rows with elem == 1 rather than a typed-in constant.
d %>%
  select(contains("qual_q80"), elem) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  filter(elem == 1) %>%
  group_by(elem) %>%
  summarize(
    across(contains("qual_q80"), ~ sum(.x, na.rm = TRUE)),
    n_teachers = n()
  ) %>%
  pivot_longer(contains("qual_q80"), names_to = "key", values_to = "val") %>%
  mutate(prop = val / n_teachers) %>%
  select(elem, key, val, prop) %>%
  arrange(desc(prop)) %>%
  knitr::kable()

elem	key	val	prop
1	qual_q80_data_visualization_graph_creation	16	0.50000
1	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	15	0.46875
1	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	10	0.31250
1	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	8	0.25000
1	qual_q80_data_summary_summary_table_averaging_math_tasks	8	0.25000
1	qual_q80_data_application_answering_question_cer_explanations_comparing	8	0.25000
1	qual_q80_other_those_that_dont_fit_into_other_categories_modeling	5	0.15625

# Number of respondents with and without the middle-school flag
count(d, middle)

# A tibble: 2 × 2
  middle     n
   <dbl> <int>
1      0   216
2      1   114

# Middle-school teachers (middle == 1): number (val) and percentage (prop,
# two decimals) of teachers whose Q80 response received each code. The
# denominator is the number of rows with middle == 1 rather than a typed-in
# constant.
d %>%
  select(contains("qual_q80"), middle) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  filter(middle == 1) %>%
  group_by(middle) %>%
  summarize(
    across(contains("qual_q80"), ~ sum(.x, na.rm = TRUE)),
    n_teachers = n()
  ) %>%
  pivot_longer(contains("qual_q80"), names_to = "key", values_to = "val") %>%
  mutate(prop = round(val / n_teachers * 100, 2)) %>%
  select(middle, key, val, prop) %>%
  arrange(desc(prop)) %>%
  knitr::kable()

middle	key	val	prop
1	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	48	42.11
1	qual_q80_data_visualization_graph_creation	48	42.11
1	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	46	40.35
1	qual_q80_data_application_answering_question_cer_explanations_comparing	31	27.19
1	qual_q80_other_those_that_dont_fit_into_other_categories_modeling	25	21.93
1	qual_q80_data_summary_summary_table_averaging_math_tasks	24	21.05
1	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	18	15.79

# Number of respondents with and without the high-school flag
count(d, high)

# A tibble: 2 × 2
   high     n
  <dbl> <int>
1     0    98
2     1   232

# High-school teachers (high == 1): number (val) and percentage (prop, two
# decimals) of teachers whose Q80 response received each code. The
# denominator is the number of rows with high == 1 rather than a typed-in
# constant.
d %>%
  select(contains("qual_q80"), high) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  filter(high == 1) %>%
  group_by(high) %>%
  summarize(
    across(contains("qual_q80"), ~ sum(.x, na.rm = TRUE)),
    n_teachers = n()
  ) %>%
  pivot_longer(contains("qual_q80"), names_to = "key", values_to = "val") %>%
  mutate(prop = round(val / n_teachers * 100, 2)) %>%
  select(high, key, val, prop) %>%
  arrange(desc(prop)) %>%
  knitr::kable()

high	key	val	prop
1	qual_q80_data_visualization_graph_creation	125	53.88
1	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	95	40.95
1	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	78	33.62
1	qual_q80_data_summary_summary_table_averaging_math_tasks	71	30.60
1	qual_q80_data_application_answering_question_cer_explanations_comparing	48	20.69
1	qual_q80_other_those_that_dont_fit_into_other_categories_modeling	47	20.26
1	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	35	15.09

Subject area

# Split the comma-separated professional_role field into one row per role,
# strip surrounding whitespace from each role, and turn the free-text
# "other" field into a TRUE/FALSE flag for mentions of "environmental"
# (case-insensitive).
# NOTE(review): the original author marked the "environmental" detection as
# needing a double-check.
d_long <- d %>%
  separate_rows(professional_role, sep = ",") %>%
  mutate(
    professional_role = trimws(professional_role),
    professional_role_6_text = str_detect(
      stringr::str_to_lower(professional_role_6_text),
      "environmental"
    )
  )

# Spread the roles into one 0/1 indicator column per professional role, keep
# only the Q80 code columns and the Biology:Physics indicator columns, and
# coerce everything to numeric with missing values set to 0.
d_wide <- d_long %>%
  mutate(value = 1) %>%
  pivot_wider(
    names_from = professional_role,
    values_from = value,
    values_fill = 0
  ) %>%
  select(contains("qual_q80"), Biology:Physics) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  janitor::clean_names() %>%
  mutate(across(everything(), ~ replace_na(as.numeric(.x), 0)))

# Q80 code columns: every d_wide column whose name begins with "qual_q80"
task_columns <- names(d_wide)[startsWith(names(d_wide), "qual_q80")]

# Subject indicator columns to report on
subject_columns <- c(
  "biology",
  "other_please_describe",
  "general_science",
  "earth_science_or_earth_and_space_science",
  "chemistry",
  "physics"
)

# Percentage of respondents in one subject whose Q80 response received each
# code.
#
# Arguments:
#   subject_col  Name (string) of a 0/1 subject indicator column in `data`.
#   data         Data frame holding the indicator and code columns; defaults
#                to the global `d_wide`, as in the original version.
#   tasks        Character vector of code column names; defaults to the
#                global `task_columns`.
#
# Returns a one-row tibble: one percentage column per code, plus `subject`
# (the column name passed in) and `total_responses` (number of rows where the
# indicator equals 1).
calculate_percentages_by_subject <- function(subject_col,
                                             data = d_wide,
                                             tasks = task_columns) {
  stopifnot(
    is.character(subject_col),
    length(subject_col) == 1,
    subject_col %in% names(data)
  )

  # Only rows where the subject was selected; rows with NA are dropped
  selected <- data %>%
    filter(.data[[subject_col]] == 1)

  selected %>%
    summarise(across(all_of(tasks), ~ mean(.x == 1, na.rm = TRUE) * 100)) %>%
    mutate(subject = subject_col, total_responses = nrow(selected))
}

# Compute one row of percentages per subject and stack the rows into a table
percentages_by_subject <- subject_columns %>%
  lapply(calculate_percentages_by_subject) %>%
  bind_rows()

# Print with subject and respondent count first, values rounded to 2 decimals.
# Original author's note on this table: "other is environmental".
percentages_by_subject %>%
  relocate(subject, total_responses) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  knitr::kable()

subject	total_responses	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	qual_q80_data_summary_summary_table_averaging_math_tasks	qual_q80_data_visualization_graph_creation	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	qual_q80_data_application_answering_question_cer_explanations_comparing	qual_q80_other_those_that_dont_fit_into_other_categories_modeling
biology	191	33.51	17.28	28.27	54.97	39.27	22.51	20.94
other_please_describe	151	29.80	14.57	25.83	50.99	43.05	22.52	23.84
general_science	99	40.40	16.16	21.21	47.47	39.39	18.18	24.24
earth_science_or_earth_and_space_science	86	30.23	13.95	25.58	37.21	40.70	22.09	23.26
chemistry	73	36.99	13.70	26.03	52.05	35.62	24.66	17.81
physics	53	32.08	16.98	28.30	39.62	41.51	16.98	18.87

years worked

This tells us how these items correlate with years worked

# Correlation of each Q80 code indicator with years worked, sorted from the
# largest to the smallest coefficient.
d %>%
  select(contains("qual_q80"), years_worked) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  # years_worked is flattened with unlist() before numeric conversion
  mutate(years_worked = as.numeric(unlist(years_worked))) %>%
  # Keeps only rows with years_worked below 100 (missing values drop out too)
  filter(years_worked < 100) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  corrr::correlate() %>%
  corrr::focus(years_worked) %>%
  arrange(desc(years_worked))

# A tibble: 7 × 2
  term                                                              years_worked
  <chr>                                                                    <dbl>
1 qual_q80_data_interpretation_identify_and_interpret_a_graph_or_d…      0.0652 
2 qual_q80_data_collection_their_own_or_finding_data_recording_fir…      0.0517 
3 qual_q80_data_summary_summary_table_averaging_math_tasks               0.0472 
4 qual_q80_data_visualization_graph_creation                             0.0320 
5 qual_q80_data_curation_tables_organizing_entering_already_collec…      0.0122 
6 qual_q80_data_application_answering_question_cer_explanations_co…      0.00850
7 qual_q80_other_those_that_dont_fit_into_other_categories_modeling     -0.00808

prior experience

CP: not sure this is helpful

# Frequency of each distinct (comma-separated) combination in q98
count(d, q98)

# A tibble: 117 × 2
   q98                                                                         n
   <chr>                                                                   <int>
 1 Graduate level coursework                                                  10
 2 Graduate level coursework,Research Experience for Teachers (NSF RET)        1
 3 Graduate level coursework,Research experience                              10
 4 Graduate level coursework,Research experience,Research Experience for …     1
 5 Graduate level coursework,Research experience,Teacher professional dev…     6
 6 Graduate level coursework,Taught myself                                     3
 7 Graduate level coursework,Taught myself,Research Experience for Teache…     1
 8 Graduate level coursework,Taught myself,Research experience                 3
 9 Graduate level coursework,Taught myself,Research experience,Research E…     1
10 Graduate level coursework,Taught myself,Research experience,Teacher pr…     3
# ℹ 107 more rows

# One row per respondent per selected q98 option
d_long_prior <- separate_rows(d, q98, sep = ",")

# How often each individual option was selected, most frequent first
d_long_prior %>%
  count(q98, sort = TRUE)

# A tibble: 11 × 2
   q98                                            n
   <chr>                                      <int>
 1 Undergraduate course in science              246
 2 Graduate level coursework                    209
 3 High school course in science                190
 4 Undergraduate course in math                 179
 5 High school course in math                   172
 6 Teacher professional development             172
 7 Taught myself                                154
 8 Research experience                          139
 9 Research Experience for Teachers (NSF RET)    30
10 NA                                             2
11 None                                           2

# For each prior-experience option: percentage of respondents selecting it
# whose Q80 response received each code (long format, one row per
# option x code).
p <- d_long_prior %>%
  select(contains("qual_q80"), prior_exp = q98) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  pivot_longer(-prior_exp, names_to = "key", values_to = "val") %>%
  group_by(prior_exp, key) %>%
  # "drop_last" is what summarize() did implicitly: p stays grouped by prior_exp
  summarize(mean = mean(val) * 100, .groups = "drop_last")

# Wide table: one row per option, one column per code, rounded to 3 decimals.
p %>%
  ungroup() %>%
  # Exclude respondents with no option recorded. Missing values are tested
  # with is.na(); the literal string "NA" is also excluded because the
  # original filter removed it too.
  filter(!is.na(prior_exp), prior_exp != "NA") %>%
  pivot_wider(names_from = key, values_from = mean) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  knitr::kable()

prior_exp	qual_q80_data_application_answering_question_cer_explanations_comparing	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	qual_q80_data_summary_summary_table_averaging_math_tasks	qual_q80_data_visualization_graph_creation	qual_q80_other_those_that_dont_fit_into_other_categories_modeling
Graduate level coursework	23.923	33.971	16.746	41.148	30.144	49.761	21.531
High school course in math	23.256	37.209	19.186	44.767	29.651	56.395	15.116
High school course in science	23.684	35.789	17.368	43.684	26.842	55.789	16.316
None	50.000	0.000	0.000	0.000	0.000	0.000	50.000
Research experience	21.583	40.288	15.827	39.568	30.216	53.957	20.863
Research Experience for Teachers (NSF RET)	23.333	56.667	23.333	40.000	50.000	53.333	23.333
Taught myself	20.130	34.416	14.935	46.753	28.571	57.792	18.182
Teacher professional development	18.023	38.953	16.279	43.023	31.977	51.163	22.093
Undergraduate course in math	23.464	36.313	16.201	41.899	32.402	55.866	14.525
Undergraduate course in science	24.390	36.179	17.073	42.276	29.268	56.098	15.854

who does this - state (NGSS status)

# State-level standards status: read the sheet and keep only the State and
# Status columns, renamed to lower case.
state_data <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1wul5jRNZBVqZP53swygLD34sBXPu0xEyJyOqRXLG6Dc/edit?gid=1207018323#gid=1207018323"
) %>%
  select(state = State, status = Status)

# Print for a visual check
state_data

# A tibble: 51 × 2
   state        status 
   <chr>        <chr>  
 1 Maine        NGSS   
 2 Minnesota    Adapted
 3 Kansas       NGSS   
 4 Rhode Island NGSS   
 5 Vermont      NGSS   
 6 Kentucky     NGSS   
 7 Maryland     NGSS   
 8 California   NGSS   
 9 Delaware     NGSS   
10 Washington   NGSS   
# ℹ 41 more rows

# For each standards status: percentage of respondents working in states with
# that status whose Q80 response received each code.
d %>%
  select(contains("qual_q80"), state = state_work) %>%
  # The 1st and 9th qual_q80_* columns are dropped by position
  select(-c(1, 9)) %>%
  left_join(state_data, by = "state") %>%
  # Respondents whose state has no status in state_data are excluded
  filter(!is.na(status)) %>%
  # Drop the state name before reshaping. Previously it was gathered together
  # with the code columns, which coerced every value to character and left a
  # spurious all-NA `state` column in the printed table.
  select(-state) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  pivot_longer(-status, names_to = "key", values_to = "val") %>%
  group_by(status, key) %>%
  summarize(mean = mean(val) * 100, .groups = "drop") %>%
  pivot_wider(names_from = key, values_from = mean) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  knitr::kable()

status	qual_q80_data_application_answering_question_cer_explanations_comparing	qual_q80_data_collection_their_own_or_finding_data_recording_first_hand_data	qual_q80_data_curation_tables_organizing_entering_already_collected_or_recorded_data_tidy_data	qual_q80_data_interpretation_identify_and_interpret_a_graph_or_data_looking_for_patterns	qual_q80_data_summary_summary_table_averaging_math_tasks	qual_q80_data_visualization_graph_creation	qual_q80_other_those_that_dont_fit_into_other_categories_modeling	state
Adapted	17.21	40.16	16.39	32.79	27.87	45.08	23.77	NA
NGSS	26.45	29.75	14.05	44.63	28.10	52.07	14.88	NA
Not	16.95	32.20	15.25	40.68	20.34	57.63	28.81	NA

make sure our data is relevant as of 2021 - https://www.nsta.org/science-standards