Project 2

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work.

I going tol get the data from a .CSV file that I created.

college_data<-read.csv('https://raw.githubusercontent.com/vitugo23/DATA607/main/project2/grad_rates_data.csv')
dim(college_data)

## [1]  8 13

head(college_data)

##                  X       X.1    X.2                    X.3          X.4
## 1                                   5 year graduation rate total_number
## 2        Caucasian 2,410,070 59.10%                 62.20%   1499063.54
## 3         Hispanic   532,720   13.1                 41.50%     221078.8
## 4 African American   452,760 11.10%                 40.50%     183367.8
## 5    Non-Residents   272,410  6.70%                 10.15%    27649.615
## 6            Asian   211,459  6.60%                 69.30%   146541.087
##                      X.5 Number.of.College.Graduates.by.Race.or.Ethnicity.2024
## 1 associates degree rate                                                 total
## 2                     7%                                              168704.9
## 3                 40.10%                                             213620.72
## 4                     3%                                               13582.8
## 5                     8%                                               21792.8
## 6                 21.70%                                             45886.60%
##                     X.6        X.7                  X.8       X.9
## 1 bachelors degree rate      total master's degree rate     total
## 2                   51%  1229135.7                   7%  168704.9
## 3                45.30%  241322.16               12.30%  65524.56
## 4                    2%     9055.2                   5%     22638
## 5                32.00%    87171.2               52.10% 141925.61
## 6                53.10% 112284.73%               17.70% 37428.24%
##                    X.10      X.11
## 1 doctorate degree rate     total
## 2                    6%  144604.2
## 3                 2.30%  12252.56
## 4                    1%    4527.6
## 5                 8.20%  22337.62
## 6                 7.50% 15859.43%

tail(college_data)

##                  X     X.1    X.2    X.3        X.4    X.5
## 3         Hispanic 532,720   13.1 41.50%   221078.8 40.10%
## 4 African American 452,760 11.10% 40.50%   183367.8     3%
## 5    Non-Residents 272,410  6.70% 10.15%  27649.615     8%
## 6            Asian 211,459  6.60% 69.30% 146541.087 21.70%
## 7      Multiracial 116,350  2.90% 50.40%    58640.4 27.40%
## 8  American Indian  24,820  0.60% 39.30%    9754.26     9%
##   Number.of.College.Graduates.by.Race.or.Ethnicity.2024    X.6        X.7
## 3                                             213620.72 45.30%  241322.16
## 4                                               13582.8     2%     9055.2
## 5                                               21792.8 32.00%    87171.2
## 6                                             45886.60% 53.10% 112284.73%
## 7                                             31879.90% 54.50%  63410.75%
## 8                                                2233.8     9%     2233.8
##      X.8       X.9  X.10      X.11
## 3 12.30%  65524.56 2.30%  12252.56
## 4     5%     22638    1%    4527.6
## 5 52.10% 141925.61 8.20%  22337.62
## 6 17.70% 37428.24% 7.50% 15859.43%
## 7 14.80% 17219.80% 3.30%  3839.55%
## 8     5%      1241    7%    1737.4

Load the libraries required for the project.

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

Name columns on data frame.

colnames(college_data)[1] = "race"
colnames(college_data)[2] = "total_graduates"
colnames(college_data)[3] = "Percentage"
colnames(college_data)[4] = "5Y_Grad_rate"
colnames(college_data)[5] = "total1"
colnames(college_data)[6] = "assos_dg_rate"
colnames(college_data)[7] = "total2"
colnames(college_data)[8] = "bachelor_dg_rate"
colnames(college_data)[9] = "total3"
colnames(college_data)[10] = "Master_dg_rate"
colnames(college_data)[11] = "total4"
colnames(college_data)[12] = "PHD_dg_rate"
colnames(college_data)[13] = "total5"
total_graduates <- colnames(college_data[2:length(college_data)])
glimpse(college_data)

## Rows: 8
## Columns: 13
## $ race             <chr> "", "Caucasian", "Hispanic", "African American", "Non…
## $ total_graduates  <chr> "", "2,410,070", "532,720", "452,760", "272,410", "21…
## $ Percentage       <chr> "", "59.10%", "13.1", "11.10%", "6.70%", "6.60%", "2.…
## $ `5Y_Grad_rate`   <chr> "5 year graduation rate", "62.20%", "41.50%", "40.50%…
## $ total1           <chr> "total_number", "1499063.54", "221078.8", "183367.8",…
## $ assos_dg_rate    <chr> "associates degree rate", "7%", "40.10%", "3%", "8%",…
## $ total2           <chr> "total", "168704.9", "213620.72", "13582.8", "21792.8…
## $ bachelor_dg_rate <chr> "bachelors degree rate", "51%", "45.30%", "2%", "32.0…
## $ total3           <chr> "total", "1229135.7", "241322.16", "9055.2", "87171.2…
## $ Master_dg_rate   <chr> "master's degree rate", "7%", "12.30%", "5%", "52.10%…
## $ total4           <chr> "total", "168704.9", "65524.56", "22638", "141925.61"…
## $ PHD_dg_rate      <chr> "doctorate degree rate", "6%", "2.30%", "1%", "8.20%"…
## $ total5           <chr> "total", "144604.2", "12252.56", "4527.6", "22337.62"…

Remove tittle rows in data frame.

tidycollege <- tail(college_data, -1) %>%
  select(-c(1))

head(college_data)

##               race total_graduates Percentage           5Y_Grad_rate
## 1                                             5 year graduation rate
## 2        Caucasian       2,410,070     59.10%                 62.20%
## 3         Hispanic         532,720       13.1                 41.50%
## 4 African American         452,760     11.10%                 40.50%
## 5    Non-Residents         272,410      6.70%                 10.15%
## 6            Asian         211,459      6.60%                 69.30%
##         total1          assos_dg_rate    total2      bachelor_dg_rate
## 1 total_number associates degree rate     total bachelors degree rate
## 2   1499063.54                     7%  168704.9                   51%
## 3     221078.8                 40.10% 213620.72                45.30%
## 4     183367.8                     3%   13582.8                    2%
## 5    27649.615                     8%   21792.8                32.00%
## 6   146541.087                 21.70% 45886.60%                53.10%
##       total3       Master_dg_rate    total4           PHD_dg_rate    total5
## 1      total master's degree rate     total doctorate degree rate     total
## 2  1229135.7                   7%  168704.9                    6%  144604.2
## 3  241322.16               12.30%  65524.56                 2.30%  12252.56
## 4     9055.2                   5%     22638                    1%    4527.6
## 5    87171.2               52.10% 141925.61                 8.20%  22337.62
## 6 112284.73%               17.70% 37428.24%                 7.50% 15859.43%

tidycollege$Item

## NULL

Remove commas, and change values from string to numerical.

college_data[total_graduates] <- college_data[total_graduates] %>% apply(MARGIN = 2, FUN = function(x) as.numeric(str_remove(x,",")))

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

print(total_graduates)

##  [1] "total_graduates"  "Percentage"       "5Y_Grad_rate"     "total1"          
##  [5] "assos_dg_rate"    "total2"           "bachelor_dg_rate" "total3"          
##  [9] "Master_dg_rate"   "total4"           "PHD_dg_rate"      "total5"

Change format of data to long format

l_data <- college_data %>% pivot_longer(cols=total_graduates,names_to = "Comparison", values_to = "total_Comp")
print(l_data)

## # A tibble: 8 × 14
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 7 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>

l_data <- l_data %>% mutate(Race = ifelse(race == "", NA, race))

print(l_data)

## # A tibble: 8 × 15
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## #   Race <chr>

l_data <- l_data %>% fill(race)
print(l_data)

## # A tibble: 8 × 15
##   race  Percentage `5Y_Grad_rate`  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>      <dbl>          <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""          NA               NA NA                 NA     NA                NA
## 2 "Cau…       NA               NA  1.50e6            NA 168705.               NA
## 3 "His…       13.1             NA  2.21e5            NA 213621.               NA
## 4 "Afr…       NA               NA  1.83e5            NA  13583.               NA
## 5 "Non…       NA               NA  2.76e4            NA  21793.               NA
## 6 "Asi…       NA               NA  1.47e5            NA     NA                NA
## 7 "Mul…       NA               NA  5.86e4            NA     NA                NA
## 8 "Ame…       NA               NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## #   PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## #   Race <chr>

l_data <- l_data %>% janitor::clean_names()

print(l_data)

## # A tibble: 8 × 15
##   race   percentage x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate
##   <chr>       <dbl>         <dbl>   <dbl>         <dbl>   <dbl>            <dbl>
## 1 ""           NA              NA NA                 NA     NA                NA
## 2 "Cauc…       NA              NA  1.50e6            NA 168705.               NA
## 3 "Hisp…       13.1            NA  2.21e5            NA 213621.               NA
## 4 "Afri…       NA              NA  1.83e5            NA  13583.               NA
## 5 "Non-…       NA              NA  2.76e4            NA  21793.               NA
## 6 "Asia…       NA              NA  1.47e5            NA     NA                NA
## 7 "Mult…       NA              NA  5.86e4            NA     NA                NA
## 8 "Amer…       NA              NA  9.75e3            NA   2234.               NA
## # ℹ 8 more variables: total3 <dbl>, master_dg_rate <dbl>, total4 <dbl>,
## #   phd_dg_rate <dbl>, total5 <dbl>, comparison <chr>, total_comp <dbl>,
## #   race_2 <chr>

t_data <- l_data %>% spread(key=percentage, value=comparison)

print(t_data)

## # A tibble: 8 × 15
##   race      x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate  total3
##   <chr>             <dbl>   <dbl>         <dbl>   <dbl>            <dbl>   <dbl>
## 1 ""                   NA NA                 NA     NA                NA NA     
## 2 "African…            NA  1.83e5            NA  13583.               NA  9.06e3
## 3 "America…            NA  9.75e3            NA   2234.               NA  2.23e3
## 4 "Asian"              NA  1.47e5            NA     NA                NA NA     
## 5 "Caucasi…            NA  1.50e6            NA 168705.               NA  1.23e6
## 6 "Hispani…            NA  2.21e5            NA 213621.               NA  2.41e5
## 7 "Multira…            NA  5.86e4            NA     NA                NA NA     
## 8 "Non-Res…            NA  2.76e4            NA  21793.               NA  8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## #   total5 <dbl>, total_comp <dbl>, race_2 <chr>, `13.1` <chr>, `<NA>` <chr>

t_data <- t_data %>% janitor::clean_names()

print(t_data)

## # A tibble: 8 × 15
##   race      x5y_grad_rate  total1 assos_dg_rate  total2 bachelor_dg_rate  total3
##   <chr>             <dbl>   <dbl>         <dbl>   <dbl>            <dbl>   <dbl>
## 1 ""                   NA NA                 NA     NA                NA NA     
## 2 "African…            NA  1.83e5            NA  13583.               NA  9.06e3
## 3 "America…            NA  9.75e3            NA   2234.               NA  2.23e3
## 4 "Asian"              NA  1.47e5            NA     NA                NA NA     
## 5 "Caucasi…            NA  1.50e6            NA 168705.               NA  1.23e6
## 6 "Hispani…            NA  2.21e5            NA 213621.               NA  2.41e5
## 7 "Multira…            NA  5.86e4            NA     NA                NA NA     
## 8 "Non-Res…            NA  2.76e4            NA  21793.               NA  8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## #   total5 <dbl>, total_comp <dbl>, race_2 <chr>, x13_1 <chr>, na <chr>

library(ggplot2)
ggplot(college_data) +
  geom_point(mapping = aes(x=race, y=total_graduates))

## Warning: Removed 2 rows containing missing values (`geom_point()`).

project 2

Victor Torres

2024-03-02

Project 2

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work.

I going tol get the data from a .CSV file that I created.

Load the libraries required for the project.

Name columns on data frame.

Remove tittle rows in data frame.

Remove commas, and change values from string to numerical.

Change format of data to long format