Data Cleaning

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.2

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'tibble' was built under R version 4.4.2

## Warning: package 'tidyr' was built under R version 4.4.2

## Warning: package 'readr' was built under R version 4.4.2

## Warning: package 'purrr' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.2

## Warning: package 'stringr' was built under R version 4.4.2

## Warning: package 'forcats' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyr)
library(purrr)
library(stringr)
library(ggplot2)
library(readr)

# Read combined.csv (resolved against the working directory) into a tibble.
combined <- read_csv(file = "combined.csv")

## Rows: 479 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): DBN, school_name, boro
## dbl (27): Num of SAT Test Takers, SAT Critical Reading Avg. Score, SAT Math ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read the tab-separated file survey_d75.txt into a tibble.
survey_d75 <- read_tsv(file = "survey_d75.txt")

## Rows: 56 Columns: 1773
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1739): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl   (29): p_q5, p_q9, p_q13a, p_q13b, p_q13c, p_q13d, p_q14a, p_q14b, p_q1...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read the tab-separated file survey_all.txt into a tibble.
survey_all <- read_tsv(file = "survey_all.txt")

## Rows: 1646 Columns: 1942
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1904): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl   (33): p_q1, p_q3d, p_q9, p_q10, p_q12aa, p_q12ab, p_q12ac, p_q12ad, p_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Filter the survey data to include only high schools and select the columns needed for the analysis.

# Restrict survey_all to rows whose schooltype is "High School", then keep
# the contiguous column range from dbn through aca_tot_11.
survey_select <- select(
  filter(survey_all, schooltype == "High School"),
  dbn:aca_tot_11
)

Select the columns for `survey_d75`.

# Keep columns dbn through aca_tot_11 of survey_d75 (no row filter is applied).
surveryd75_select <- select(survey_d75, dbn:aca_tot_11)

Combine the selections from `survey_all` and `survey_d75`.

# Stack the two survey selections row-wise; bind_rows() matches columns by name.
survery_total <- bind_rows(survey_select, surveryd75_select)

Rename the `dbn` column to `DBN`.

# Rename the key column dbn to DBN.
survery_total <- rename(survery_total, DBN = dbn)

Join `survery_total` to `combined` using `left_join()`.

# Left join: every row of `combined` is kept and the survey columns are
# attached wherever the DBN values match.
combined_survery <- left_join(combined, survery_total, by = "DBN")

Create a correlation matrix to look for interesting relationships between pairs of variables.

# Correlation matrix of avg_sat_score and the survey columns saf_p_11 through
# aca_tot_11. "pairwise.complete.obs" computes each coefficient from the rows
# where both variables of that pair are present.
cor_mat <- cor(
  select(combined_survery, avg_sat_score, saf_p_11:aca_tot_11),
  use = "pairwise.complete.obs"
)

# Turn the matrix into a tibble, moving its row names into a "variables" column.
cor_tib <- as_tibble(cor_mat, rownames = "variables")

Look for strong correlations with `avg_sat_score`.

# Keep the variables whose correlation with avg_sat_score exceeds 0.25 in
# absolute value (either direction).
strong_cors <- filter(
  select(cor_tib, variables, avg_sat_score),
  abs(avg_sat_score) > 0.25
)

Make the scatter plots for `avg_sat_score`.

# Draw a scatter plot of two columns chosen by name.
#
# x, y: column names, each given as a single string.
# data: data frame holding those columns; defaults to combined_survery, which
#       is looked up when the function is called.
# Returns a ggplot object (semi-transparent points on a white panel).
create_scatter <- function(x, y, data = combined_survery) {
  # aes_string() is deprecated since ggplot2 3.0.0; index the .data pronoun
  # with the string names instead.
  ggplot(data = data) +
    aes(x = .data[[x]], y = .data[[y]]) +
    geom_point(alpha = 0.3) +
    # Label the axes with the bare column names, as aes_string() did.
    labs(x = x, y = y) +
    theme(panel.background = element_rect(fill = "white"))
}

# Plot the variables in rows 2 to 5 of strong_cors against avg_sat_score,
# producing one scatter plot per variable.
y_var <- "avg_sat_score"
x_var <- strong_cors$variables[2:5]

map(x_var, \(x_name) create_scatter(x_name, y_var))

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## [[1]]

## Warning: Removed 137 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[2]]

## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[3]]

## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[4]]

## Warning: Removed 137 rows containing missing values or values outside the scale range
## (`geom_point()`).

Reshape the data so that I can investigate the differences between student, parent, and teacher responses.

# Go from wide to long: the columns saf_p_11 through aca_tot_11 collapse into
# a name column ("survery_questions") and a value column ("score").
combined_gathered_survery <- pivot_longer(
  combined_survery,
  cols = saf_p_11:aca_tot_11,
  names_to = "survery_questions",
  values_to = "score"
)

Use `str_sub()` to create two new variables, `response_type` and `question`, from the `survery_questions` variable.

# Split each original column name by position: characters 4-6 become
# response_type and characters 1-3 become question.
# Note: for a name such as "aca_tot_11", characters 4-6 are "_to".
combined_gathered_survery <- mutate(
  combined_gathered_survery,
  response_type = str_sub(survery_questions, 4, 6),
  question = str_sub(survery_questions, 1, 3)
)

Replace response_type variable values with names “parent”, “teacher”, “student”, “total” using if_else() function.

# Translate the three-character respondent codes held in response_type
# (characters 4-6 of the original column name) into readable labels.
# A "_tot_" column name yields the three-character code "_to", so the total
# code is compared against "_to"; a four-character pattern such as "_to_" can
# never equal a three-character substring.
# Codes matching none of the four patterns become a real missing value
# (NA_character_) rather than the string "NA".
combined_gathered_survery <- combined_gathered_survery %>%
  mutate(
    response_type = if_else(
      response_type == "_p_", "parent",
      if_else(
        response_type == "_t_", "teacher",
        if_else(
          response_type == "_s_", "student",
          if_else(response_type == "_to", "total", NA_character_)
        )
      )
    )
  )

Make a boxplot to see if there appear to be differences in how the three groups of responders (parents, students, and teachers) answered the four questions.

# Box plot of score by question, with one box per response_type; rows whose
# response_type is "total" are excluded first.
ggplot(
  data = filter(combined_gathered_survery, response_type != "total"),
  mapping = aes(x = question, y = score, fill = response_type)
) +
  geom_boxplot()

## Warning: Removed 1688 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Data Cleaning

Aman Chaudhary

2025-01-12

R Markdown

Including Plots