Project 2

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work.

I going tol get the data from a .CSV file that I created.

college_data<-read.csv('https://raw.githubusercontent.com/vitugo23/DATA607/main/project2/grad_rates_data.csv')
dim(college_data)
## [1]  8 13
head(college_data)
##                  X       X.1    X.2                    X.3          X.4
## 1                                   5 year graduation rate total_number
## 2        Caucasian 2,410,070 59.10%                 62.20%   1499063.54
## 3         Hispanic   532,720   13.1                 41.50%     221078.8
## 4 African American   452,760 11.10%                 40.50%     183367.8
## 5    Non-Residents   272,410  6.70%                 10.15%    27649.615
## 6            Asian   211,459  6.60%                 69.30%   146541.087
##                      X.5 Number.of.College.Graduates.by.Race.or.Ethnicity.2024
## 1 associates degree rate                                                 total
## 2                     7%                                              168704.9
## 3                 40.10%                                             213620.72
## 4                     3%                                               13582.8
## 5                     8%                                               21792.8
## 6                 21.70%                                             45886.60%
##                     X.6        X.7                  X.8       X.9
## 1 bachelors degree rate      total master's degree rate     total
## 2                   51%  1229135.7                   7%  168704.9
## 3                45.30%  241322.16               12.30%  65524.56
## 4                    2%     9055.2                   5%     22638
## 5                32.00%    87171.2               52.10% 141925.61
## 6                53.10% 112284.73%               17.70% 37428.24%
##                    X.10      X.11
## 1 doctorate degree rate     total
## 2                    6%  144604.2
## 3                 2.30%  12252.56
## 4                    1%    4527.6
## 5                 8.20%  22337.62
## 6                 7.50% 15859.43%
tail(college_data)
##                  X     X.1    X.2    X.3        X.4    X.5
## 3         Hispanic 532,720   13.1 41.50%   221078.8 40.10%
## 4 African American 452,760 11.10% 40.50%   183367.8     3%
## 5    Non-Residents 272,410  6.70% 10.15%  27649.615     8%
## 6            Asian 211,459  6.60% 69.30% 146541.087 21.70%
## 7      Multiracial 116,350  2.90% 50.40%    58640.4 27.40%
## 8  American Indian  24,820  0.60% 39.30%    9754.26     9%
##   Number.of.College.Graduates.by.Race.or.Ethnicity.2024    X.6        X.7
## 3                                             213620.72 45.30%  241322.16
## 4                                               13582.8     2%     9055.2
## 5                                               21792.8 32.00%    87171.2
## 6                                             45886.60% 53.10% 112284.73%
## 7                                             31879.90% 54.50%  63410.75%
## 8                                                2233.8     9%     2233.8
##      X.8       X.9  X.10      X.11
## 3 12.30%  65524.56 2.30%  12252.56
## 4     5%     22638    1%    4527.6
## 5 52.10% 141925.61 8.20%  22337.62
## 6 17.70% 37428.24% 7.50% 15859.43%
## 7 14.80% 17219.80% 3.30%  3839.55%
## 8     5%      1241    7%    1737.4

Load the libraries required for the project.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

Name columns on data frame.

colnames(college_data)[1] = "race"
colnames(college_data)[2] = "total_graduates"
colnames(college_data)[3] = "Percentage"
colnames(college_data)[4] = "5Y_Grad_rate"
colnames(college_data)[5] = "total1"
colnames(college_data)[6] = "assos_dg_rate"
colnames(college_data)[7] = "total2"
colnames(college_data)[8] = "bachelor_dg_rate"
colnames(college_data)[9] = "total3"
colnames(college_data)[10] = "Master_dg_rate"
colnames(college_data)[11] = "total4"
colnames(college_data)[12] = "PHD_dg_rate"
colnames(college_data)[13] = "total5"
total_graduates <- colnames(college_data[2:length(college_data)])
glimpse(college_data)
## Rows: 8
## Columns: 13
## $ race             <chr> "", "Caucasian", "Hispanic", "African American", "Non…
## $ total_graduates  <chr> "", "2,410,070", "532,720", "452,760", "272,410", "21…
## $ Percentage       <chr> "", "59.10%", "13.1", "11.10%", "6.70%", "6.60%", "2.…
## $ `5Y_Grad_rate`   <chr> "5 year graduation rate", "62.20%", "41.50%", "40.50%…
## $ total1           <chr> "total_number", "1499063.54", "221078.8", "183367.8",…
## $ assos_dg_rate    <chr> "associates degree rate", "7%", "40.10%", "3%", "8%",…
## $ total2           <chr> "total", "168704.9", "213620.72", "13582.8", "21792.8…
## $ bachelor_dg_rate <chr> "bachelors degree rate", "51%", "45.30%", "2%", "32.0…
## $ total3           <chr> "total", "1229135.7", "241322.16", "9055.2", "87171.2…
## $ Master_dg_rate   <chr> "master's degree rate", "7%", "12.30%", "5%", "52.10%…
## $ total4           <chr> "total", "168704.9", "65524.56", "22638", "141925.61"…
## $ PHD_dg_rate      <chr> "doctorate degree rate", "6%", "2.30%", "1%", "8.20%"…
## $ total5           <chr> "total", "144604.2", "12252.56", "4527.6", "22337.62"…

Remove tittle rows in data frame.

tidycollege <- tail(college_data, -1) %>%
  select(-c(1))

head(college_data)
##               race total_graduates Percentage           5Y_Grad_rate
## 1                                             5 year graduation rate
## 2        Caucasian       2,410,070     59.10%                 62.20%
## 3         Hispanic         532,720       13.1                 41.50%
## 4 African American         452,760     11.10%                 40.50%
## 5    Non-Residents         272,410      6.70%                 10.15%
## 6            Asian         211,459      6.60%                 69.30%
##         total1          assos_dg_rate    total2      bachelor_dg_rate
## 1 total_number associates degree rate     total bachelors degree rate
## 2   1499063.54                     7%  168704.9                   51%
## 3     221078.8                 40.10% 213620.72                45.30%
## 4     183367.8                     3%   13582.8                    2%
## 5    27649.615                     8%   21792.8                32.00%
## 6   146541.087                 21.70% 45886.60%                53.10%
##       total3       Master_dg_rate    total4           PHD_dg_rate    total5
## 1      total master's degree rate     total doctorate degree rate     total
## 2  1229135.7                   7%  168704.9                    6%  144604.2
## 3  241322.16               12.30%  65524.56                 2.30%  12252.56
## 4     9055.2                   5%     22638                    1%    4527.6
## 5    87171.2               52.10% 141925.61                 8.20%  22337.62
## 6 112284.73%               17.70% 37428.24%                 7.50% 15859.43%
tidycollege$Item
## NULL

Remove commas, and change values from string to numerical.

college_data[total_graduates] <- college_data[total_graduates] %>% apply(MARGIN = 2, FUN = function(x) as.numeric(str_remove(x,",")))
## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion
print(total_graduates)
##  [1] "total_graduates"  "Percentage"       "5Y_Grad_rate"     "total1"          
##  [5] "assos_dg_rate"    "total2"           "bachelor_dg_rate" "total3"          
##  [9] "Master_dg_rate"   "total4"           "PHD_dg_rate"      "total5"

Change format of data to long format

l_data <- college_data %>% pivot_longer(cols=total_graduates,names_to = "Comparison", values_to = "total_Comp")
print(l_data)
## # A tibble: 8 × 14
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 7 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>
l_data <- l_data %>% mutate(Race = ifelse(race == "", NA, race))

print(l_data)
## # A tibble: 8 × 15
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## #   Race <chr>
l_data <- l_data %>% fill(race)
print(l_data)
## # A tibble: 8 × 15
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## #   Race <chr>
l_data <- l_data %>% janitor::clean_names()

print(l_data)
## # A tibble: 8 × 15
##   race   percentage x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>       <dbl>         <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""           NA              NA NA                 NA     NA                NA
## 2 "Cauc…       NA              NA  1.50e6            NA 168705.               NA
## 3 "Hisp…       13.1            NA  2.21e5            NA 213621.               NA
## 4 "Afri…       NA              NA  1.83e5            NA  13583.               NA
## 5 "Non-…       NA              NA  2.76e4            NA  21793.               NA
## 6 "Asia…       NA              NA  1.47e5            NA     NA                NA
## 7 "Mult…       NA              NA  5.86e4            NA     NA                NA
## 8 "Amer…       NA              NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, master_dg_rate <dbl>, total4 <dbl>,
## #   phd_dg_rate <dbl>, total5 <dbl>, comparison <chr>, total_comp <dbl>,
## #   race_2 <chr>
t_data <- l_data %>% spread(key=percentage, value=comparison)

print(t_data)
## # A tibble: 8 × 15
##   race      x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate  total3
##   <chr>             <dbl>   <dbl>         <dbl>   <dbl>            <dbl>   <dbl>
## 1 ""                   NA NA                 NA     NA                NA NA     
## 2 "African…            NA  1.83e5            NA  13583.               NA  9.06e3
## 3 "America…            NA  9.75e3            NA   2234.               NA  2.23e3
## 4 "Asian"              NA  1.47e5            NA     NA                NA NA     
## 5 "Caucasi…            NA  1.50e6            NA 168705.               NA  1.23e6
## 6 "Hispani…            NA  2.21e5            NA 213621.               NA  2.41e5
## 7 "Multira…            NA  5.86e4            NA     NA                NA NA     
## 8 "Non-Res…            NA  2.76e4            NA  21793.               NA  8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## #   total5 <dbl>, total_comp <dbl>, race_2 <chr>, `13.1` <chr>, `<NA>` <chr>
t_data <- t_data %>% janitor::clean_names()

print(t_data)
## # A tibble: 8 × 15
##   race      x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate  total3
##   <chr>             <dbl>   <dbl>         <dbl>   <dbl>            <dbl>   <dbl>
## 1 ""                   NA NA                 NA     NA                NA NA     
## 2 "African…            NA  1.83e5            NA  13583.               NA  9.06e3
## 3 "America…            NA  9.75e3            NA   2234.               NA  2.23e3
## 4 "Asian"              NA  1.47e5            NA     NA                NA NA     
## 5 "Caucasi…            NA  1.50e6            NA 168705.               NA  1.23e6
## 6 "Hispani…            NA  2.21e5            NA 213621.               NA  2.41e5
## 7 "Multira…            NA  5.86e4            NA     NA                NA NA     
## 8 "Non-Res…            NA  2.76e4            NA  21793.               NA  8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## #   total5 <dbl>, total_comp <dbl>, race_2 <chr>, x13_1 <chr>, na <chr>
library(ggplot2)
ggplot(college_data) +
  geom_point(mapping = aes(x=race, y=total_graduates))
## Warning: Removed 2 rows containing missing values (`geom_point()`).