The goal of this assignment is to give you practice in preparing
different datasets for downstream analysis work.
I going tol get the data from a .CSV file that I created.
college_data<-read.csv('https://raw.githubusercontent.com/vitugo23/DATA607/main/project2/grad_rates_data.csv')
dim(college_data)
## [1] 8 13
head(college_data)
## X X.1 X.2 X.3 X.4
## 1 5 year graduation rate total_number
## 2 Caucasian 2,410,070 59.10% 62.20% 1499063.54
## 3 Hispanic 532,720 13.1 41.50% 221078.8
## 4 African American 452,760 11.10% 40.50% 183367.8
## 5 Non-Residents 272,410 6.70% 10.15% 27649.615
## 6 Asian 211,459 6.60% 69.30% 146541.087
## X.5 Number.of.College.Graduates.by.Race.or.Ethnicity.2024
## 1 associates degree rate total
## 2 7% 168704.9
## 3 40.10% 213620.72
## 4 3% 13582.8
## 5 8% 21792.8
## 6 21.70% 45886.60%
## X.6 X.7 X.8 X.9
## 1 bachelors degree rate total master's degree rate total
## 2 51% 1229135.7 7% 168704.9
## 3 45.30% 241322.16 12.30% 65524.56
## 4 2% 9055.2 5% 22638
## 5 32.00% 87171.2 52.10% 141925.61
## 6 53.10% 112284.73% 17.70% 37428.24%
## X.10 X.11
## 1 doctorate degree rate total
## 2 6% 144604.2
## 3 2.30% 12252.56
## 4 1% 4527.6
## 5 8.20% 22337.62
## 6 7.50% 15859.43%
tail(college_data)
## X X.1 X.2 X.3 X.4 X.5
## 3 Hispanic 532,720 13.1 41.50% 221078.8 40.10%
## 4 African American 452,760 11.10% 40.50% 183367.8 3%
## 5 Non-Residents 272,410 6.70% 10.15% 27649.615 8%
## 6 Asian 211,459 6.60% 69.30% 146541.087 21.70%
## 7 Multiracial 116,350 2.90% 50.40% 58640.4 27.40%
## 8 American Indian 24,820 0.60% 39.30% 9754.26 9%
## Number.of.College.Graduates.by.Race.or.Ethnicity.2024 X.6 X.7
## 3 213620.72 45.30% 241322.16
## 4 13582.8 2% 9055.2
## 5 21792.8 32.00% 87171.2
## 6 45886.60% 53.10% 112284.73%
## 7 31879.90% 54.50% 63410.75%
## 8 2233.8 9% 2233.8
## X.8 X.9 X.10 X.11
## 3 12.30% 65524.56 2.30% 12252.56
## 4 5% 22638 1% 4527.6
## 5 52.10% 141925.61 8.20% 22337.62
## 6 17.70% 37428.24% 7.50% 15859.43%
## 7 14.80% 17219.80% 3.30% 3839.55%
## 8 5% 1241 7% 1737.4
Load the libraries required for the project.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Name columns on data frame.
colnames(college_data)[1] = "race"
colnames(college_data)[2] = "total_graduates"
colnames(college_data)[3] = "Percentage"
colnames(college_data)[4] = "5Y_Grad_rate"
colnames(college_data)[5] = "total1"
colnames(college_data)[6] = "assos_dg_rate"
colnames(college_data)[7] = "total2"
colnames(college_data)[8] = "bachelor_dg_rate"
colnames(college_data)[9] = "total3"
colnames(college_data)[10] = "Master_dg_rate"
colnames(college_data)[11] = "total4"
colnames(college_data)[12] = "PHD_dg_rate"
colnames(college_data)[13] = "total5"
total_graduates <- colnames(college_data[2:length(college_data)])
glimpse(college_data)
## Rows: 8
## Columns: 13
## $ race <chr> "", "Caucasian", "Hispanic", "African American", "Non…
## $ total_graduates <chr> "", "2,410,070", "532,720", "452,760", "272,410", "21…
## $ Percentage <chr> "", "59.10%", "13.1", "11.10%", "6.70%", "6.60%", "2.…
## $ `5Y_Grad_rate` <chr> "5 year graduation rate", "62.20%", "41.50%", "40.50%…
## $ total1 <chr> "total_number", "1499063.54", "221078.8", "183367.8",…
## $ assos_dg_rate <chr> "associates degree rate", "7%", "40.10%", "3%", "8%",…
## $ total2 <chr> "total", "168704.9", "213620.72", "13582.8", "21792.8…
## $ bachelor_dg_rate <chr> "bachelors degree rate", "51%", "45.30%", "2%", "32.0…
## $ total3 <chr> "total", "1229135.7", "241322.16", "9055.2", "87171.2…
## $ Master_dg_rate <chr> "master's degree rate", "7%", "12.30%", "5%", "52.10%…
## $ total4 <chr> "total", "168704.9", "65524.56", "22638", "141925.61"…
## $ PHD_dg_rate <chr> "doctorate degree rate", "6%", "2.30%", "1%", "8.20%"…
## $ total5 <chr> "total", "144604.2", "12252.56", "4527.6", "22337.62"…
Remove tittle rows in data frame.
tidycollege <- tail(college_data, -1) %>%
select(-c(1))
head(college_data)
## race total_graduates Percentage 5Y_Grad_rate
## 1 5 year graduation rate
## 2 Caucasian 2,410,070 59.10% 62.20%
## 3 Hispanic 532,720 13.1 41.50%
## 4 African American 452,760 11.10% 40.50%
## 5 Non-Residents 272,410 6.70% 10.15%
## 6 Asian 211,459 6.60% 69.30%
## total1 assos_dg_rate total2 bachelor_dg_rate
## 1 total_number associates degree rate total bachelors degree rate
## 2 1499063.54 7% 168704.9 51%
## 3 221078.8 40.10% 213620.72 45.30%
## 4 183367.8 3% 13582.8 2%
## 5 27649.615 8% 21792.8 32.00%
## 6 146541.087 21.70% 45886.60% 53.10%
## total3 Master_dg_rate total4 PHD_dg_rate total5
## 1 total master's degree rate total doctorate degree rate total
## 2 1229135.7 7% 168704.9 6% 144604.2
## 3 241322.16 12.30% 65524.56 2.30% 12252.56
## 4 9055.2 5% 22638 1% 4527.6
## 5 87171.2 52.10% 141925.61 8.20% 22337.62
## 6 112284.73% 17.70% 37428.24% 7.50% 15859.43%
tidycollege$Item
## NULL
Remove commas, and change values from string to numerical.
college_data[total_graduates] <- college_data[total_graduates] %>% apply(MARGIN = 2, FUN = function(x) as.numeric(str_remove(x,",")))
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
print(total_graduates)
## [1] "total_graduates" "Percentage" "5Y_Grad_rate" "total1"
## [5] "assos_dg_rate" "total2" "bachelor_dg_rate" "total3"
## [9] "Master_dg_rate" "total4" "PHD_dg_rate" "total5"
Change format of data to long format
l_data <- college_data %>% pivot_longer(cols=total_graduates,names_to = "Comparison", values_to = "total_Comp")
print(l_data)
## # A tibble: 8 × 14
## race Percentage `5Y_Grad_rate` total1 assos_dg_rate total2 bachelor_dg_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "Cau… NA NA 1.50e6 NA 168705. NA
## 3 "His… 13.1 NA 2.21e5 NA 213621. NA
## 4 "Afr… NA NA 1.83e5 NA 13583. NA
## 5 "Non… NA NA 2.76e4 NA 21793. NA
## 6 "Asi… NA NA 1.47e5 NA NA NA
## 7 "Mul… NA NA 5.86e4 NA NA NA
## 8 "Ame… NA NA 9.75e3 NA 2234. NA
## # ℹ 7 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## # PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>
l_data <- l_data %>% mutate(Race = ifelse(race == "", NA, race))
print(l_data)
## # A tibble: 8 × 15
## race Percentage `5Y_Grad_rate` total1 assos_dg_rate total2 bachelor_dg_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "Cau… NA NA 1.50e6 NA 168705. NA
## 3 "His… 13.1 NA 2.21e5 NA 213621. NA
## 4 "Afr… NA NA 1.83e5 NA 13583. NA
## 5 "Non… NA NA 2.76e4 NA 21793. NA
## 6 "Asi… NA NA 1.47e5 NA NA NA
## 7 "Mul… NA NA 5.86e4 NA NA NA
## 8 "Ame… NA NA 9.75e3 NA 2234. NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## # PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## # Race <chr>
l_data <- l_data %>% fill(race)
print(l_data)
## # A tibble: 8 × 15
## race Percentage `5Y_Grad_rate` total1 assos_dg_rate total2 bachelor_dg_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "Cau… NA NA 1.50e6 NA 168705. NA
## 3 "His… 13.1 NA 2.21e5 NA 213621. NA
## 4 "Afr… NA NA 1.83e5 NA 13583. NA
## 5 "Non… NA NA 2.76e4 NA 21793. NA
## 6 "Asi… NA NA 1.47e5 NA NA NA
## 7 "Mul… NA NA 5.86e4 NA NA NA
## 8 "Ame… NA NA 9.75e3 NA 2234. NA
## # ℹ 8 more variables: total3 <dbl>, Master_dg_rate <dbl>, total4 <dbl>,
## # PHD_dg_rate <dbl>, total5 <dbl>, Comparison <chr>, total_Comp <dbl>,
## # Race <chr>
l_data <- l_data %>% janitor::clean_names()
print(l_data)
## # A tibble: 8 × 15
## race percentage x5y_grad_rate total1 assos_dg_rate total2 bachelor_dg_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "Cauc… NA NA 1.50e6 NA 168705. NA
## 3 "Hisp… 13.1 NA 2.21e5 NA 213621. NA
## 4 "Afri… NA NA 1.83e5 NA 13583. NA
## 5 "Non-… NA NA 2.76e4 NA 21793. NA
## 6 "Asia… NA NA 1.47e5 NA NA NA
## 7 "Mult… NA NA 5.86e4 NA NA NA
## 8 "Amer… NA NA 9.75e3 NA 2234. NA
## # ℹ 8 more variables: total3 <dbl>, master_dg_rate <dbl>, total4 <dbl>,
## # phd_dg_rate <dbl>, total5 <dbl>, comparison <chr>, total_comp <dbl>,
## # race_2 <chr>
t_data <- l_data %>% spread(key=percentage, value=comparison)
print(t_data)
## # A tibble: 8 × 15
## race x5y_grad_rate total1 assos_dg_rate total2 bachelor_dg_rate total3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "African… NA 1.83e5 NA 13583. NA 9.06e3
## 3 "America… NA 9.75e3 NA 2234. NA 2.23e3
## 4 "Asian" NA 1.47e5 NA NA NA NA
## 5 "Caucasi… NA 1.50e6 NA 168705. NA 1.23e6
## 6 "Hispani… NA 2.21e5 NA 213621. NA 2.41e5
## 7 "Multira… NA 5.86e4 NA NA NA NA
## 8 "Non-Res… NA 2.76e4 NA 21793. NA 8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## # total5 <dbl>, total_comp <dbl>, race_2 <chr>, `13.1` <chr>, `<NA>` <chr>
t_data <- t_data %>% janitor::clean_names()
print(t_data)
## # A tibble: 8 × 15
## race x5y_grad_rate total1 assos_dg_rate total2 bachelor_dg_rate total3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "" NA NA NA NA NA NA
## 2 "African… NA 1.83e5 NA 13583. NA 9.06e3
## 3 "America… NA 9.75e3 NA 2234. NA 2.23e3
## 4 "Asian" NA 1.47e5 NA NA NA NA
## 5 "Caucasi… NA 1.50e6 NA 168705. NA 1.23e6
## 6 "Hispani… NA 2.21e5 NA 213621. NA 2.41e5
## 7 "Multira… NA 5.86e4 NA NA NA NA
## 8 "Non-Res… NA 2.76e4 NA 21793. NA 8.72e4
## # ℹ 8 more variables: master_dg_rate <dbl>, total4 <dbl>, phd_dg_rate <dbl>,
## # total5 <dbl>, total_comp <dbl>, race_2 <chr>, x13_1 <chr>, na <chr>
library(ggplot2)
ggplot(college_data) +
geom_point(mapping = aes(x=race, y=total_graduates))
## Warning: Removed 2 rows containing missing values (`geom_point()`).
