###Introduction

According to to the article ‘Cell suppression: Disclosure protection for sensitive tabular data’, “When a statistical agency, such as the United States Bureau of the Census, publishes tabular data, it must withhold certain data elements that contain confidential information associated with the data respondents. Cell suppression is a technique commonly used in the publishing of economic data in tabular formats.” An example of this is the data used for analysis in this project. The College Scorecard is an online government website that has financial information pertinent to the cost of post secondary education. As per The College Scorecard cell suppression techniques were used to “reduce the risk of disclosure of confidential or indentifying.”

Libraries Used

library(dplyr)
library(tidyverse)
library(lubridate)
library(tidyquant)

Importing CSV File

df<-read_csv("https://raw.githubusercontent.com/engine2031/Data-607/main/Project%202_Privacy%20Suppresion%20Education%20Data2.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   `Credential level` = col_character(),
##   Metric = col_character(),
##   `Reportable
## fields of study
## (UNITID level)` = col_character(),
##   `Proportion of
## graduates in
## reportable
## fields of study
## (IPEDS)` = col_character(),
##   `Reportable
## fields of
## study
## (OPEID6
## level)1` = col_character(),
##   `Proportion of
## graduates who
## received federal
## financial aid in
## reportable fields
## of study` = col_character()
## )
head(df)
## # A tibble: 6 x 6
##   `Credential lev~ Metric `Reportable\nfi~ `Proportion of\~ `Reportable\nfi~
##   <chr>            <chr>  <chr>            <chr>            <chr>           
## 1 "All"            Media~ 24.40%           68.00%           32.60%          
## 2  <NA>            Media~ 29.00%           70.70%           40.20%          
## 3  <NA>            Both   21.10%           63.25%           27.80%          
## 4 "Undergraduate\~ Media~ 15.20%           41.50%           24.20%          
## 5  <NA>            Media~ 17.70%           44.30%           32.20%          
## 6  <NA>            Both   12.00%           34.10%           17.80%          
## # ... with 1 more variable: `Proportion of\ngraduates who\nreceived
## #   federal\nfinancial aid in\nreportable fields\nof study` <chr>

Filling 1st column blank values

df2 <- df %>% mutate(Credential_Level = na.locf(df$'Credential level')) %>% 
  relocate(Credential_Level, .after = `Credential level`)

df3 <- df2 %>% select(-'Credential level')
head(df3)
## # A tibble: 6 x 6
##   Credential_Level Metric `Reportable\nfi~ `Proportion of\~ `Reportable\nfi~
##   <chr>            <chr>  <chr>            <chr>            <chr>           
## 1 "All"            Media~ 24.40%           68.00%           32.60%          
## 2 "All"            Media~ 29.00%           70.70%           40.20%          
## 3 "All"            Both   21.10%           63.25%           27.80%          
## 4 "Undergraduate\~ Media~ 15.20%           41.50%           24.20%          
## 5 "Undergraduate\~ Media~ 17.70%           44.30%           32.20%          
## 6 "Undergraduate\~ Both   12.00%           34.10%           17.80%          
## # ... with 1 more variable: `Proportion of\ngraduates who\nreceived
## #   federal\nfinancial aid in\nreportable fields\nof study` <chr>

Renaming of columns names to shorten length

df4 <- rename(df3, 'Field of Study Type 1' = c(3), 'Graduates' = c(4), 'Field of Study Type 2' = c(5), 'Graduates with Financial Aid' = c(6))
head(df4)
## # A tibble: 6 x 6
##   Credential_Level Metric `Field of Study~ Graduates `Field of Study~
##   <chr>            <chr>  <chr>            <chr>     <chr>           
## 1 "All"            Media~ 24.40%           68.00%    32.60%          
## 2 "All"            Media~ 29.00%           70.70%    40.20%          
## 3 "All"            Both   21.10%           63.25%    27.80%          
## 4 "Undergraduate\~ Media~ 15.20%           41.50%    24.20%          
## 5 "Undergraduate\~ Media~ 17.70%           44.30%    32.20%          
## 6 "Undergraduate\~ Both   12.00%           34.10%    17.80%          
## # ... with 1 more variable: `Graduates with Financial Aid` <chr>

Changing data structure from wide to long

df5 <- df4 %>% pivot_longer(c(`Field of Study Type 1`, 'Graduates', `Field of Study Type 2`, `Graduates with Financial Aid`), names_to = 'Education Type', values_to = 'Reportable_Data')
df5
## # A tibble: 108 x 4
##    Credential_Level Metric          `Education Type`             Reportable_Data
##    <chr>            <chr>           <chr>                        <chr>          
##  1 All              Median debt     Field of Study Type 1        24.40%         
##  2 All              Median debt     Graduates                    68.00%         
##  3 All              Median debt     Field of Study Type 2        32.60%         
##  4 All              Median debt     Graduates with Financial Aid 81.40%         
##  5 All              Median earnings Field of Study Type 1        29.00%         
##  6 All              Median earnings Graduates                    70.70%         
##  7 All              Median earnings Field of Study Type 2        40.20%         
##  8 All              Median earnings Graduates with Financial Aid 90.10%         
##  9 All              Both            Field of Study Type 1        21.10%         
## 10 All              Both            Graduates                    63.25%         
## # ... with 98 more rows

String Manipulation to clean up the data.

df5$Credential_Level  <- str_replace_all(df5$Credential_Level, "\\n", " ")
df5$'Reportable_Data' <- str_remove(df5$'Reportable_Data', "%")
df5$'Reportable_Data' <- as.integer(df5$'Reportable_Data')

head(df5)
## # A tibble: 6 x 4
##   Credential_Level Metric          `Education Type`             Reportable_Data
##   <chr>            <chr>           <chr>                                  <int>
## 1 All              Median debt     Field of Study Type 1                     24
## 2 All              Median debt     Graduates                                 68
## 3 All              Median debt     Field of Study Type 2                     32
## 4 All              Median debt     Graduates with Financial Aid              81
## 5 All              Median earnings Field of Study Type 1                     29
## 6 All              Median earnings Graduates                                 70

Filter Data in preperation for Data Visualization

field_of_study <- df5 %>% filter(Metric=='Median debt') %>% 
    filter(Credential_Level != 'All') %>% filter(`Education Type`!='Graduates')%>%
    filter(`Education Type`!='Graduates with Financial Aid')
head(field_of_study)
## # A tibble: 6 x 4
##   Credential_Level           Metric      `Education Type`      Reportable_Data
##   <chr>                      <chr>       <chr>                           <int>
## 1 Undergraduate Certificates Median debt Field of Study Type 1              15
## 2 Undergraduate Certificates Median debt Field of Study Type 2              24
## 3 Associate’s Degrees        Median debt Field of Study Type 1              18
## 4 Associate’s Degrees        Median debt Field of Study Type 2              22
## 5 Bachelor’s Degrees         Median debt Field of Study Type 1              40
## 6 Bachelor’s Degrees         Median debt Field of Study Type 2              44

Data Visualization for Report Data for Median Debt

field_of_study %>%
  ggplot(aes(x=reorder(Credential_Level,Reportable_Data),y=Reportable_Data, fill = `Education Type`)) + 
  geom_bar(stat = 'identity',position=position_dodge()) +
  scale_fill_brewer(palette="Paired") +
  labs(y = ("Reportable Data (%)"),x = ("Credential Level"),
      title = ("Reportable Data per Credential Level"))+ 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 65, hjust = 1))

##References https://onlinelibrary.wiley.com/doi/abs/10.1002/net.3230220407#:~:text=Cell%20suppression%20is%20a%20technique,economic%20data%20in%20tabular%20formats.&text=Additional%20entries%20in%20the%20table,value%20of%20each%20primary%20suppression.