{r} # usethis::use_vignette("Week_9_TidyVerse_GitHub_CREATE_assignment") #
Your task here is to Create an Example. Using one or more TidyVerse packages, and any dataset from fivethirtyeight.com or Kaggle, create a programming sample “vignette” that demonstrates how to use one or more of the capabilities of the selected TidyVerse package with your selected dataset. (25 points)
Later, you’ll be asked to extend an existing vignette. Using one of your classmate’s examples (as created above), you’ll then extend his or her example with additional annotated code. (15 points)
You should clone the provided repository. Once you have code to submit, you should make a pull request on the shared repository. You should also update the README.md file with your example.
After you’ve created your vignette, please submit your GitHub handle name in the submission link provided below. This will let your instructor know that your work is ready to be peer-graded.
You should complete your submission on the schedule stated in the course syllabus. #
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.4.2
#> Warning: package 'ggplot2' was built under R version 4.4.2
#> Warning: package 'stringr' was built under R version 4.4.2
#> Warning: package 'lubridate' was built under R version 4.4.2
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
#> ✔ readr 2.1.5
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
The data source is the Kaggle website: https://www.kaggle.com/datasets/tunguz/college-majors
# Load the college-majors CSV from the GitHub mirror into a tibble.
dataset <- readr::read_csv(
  file = "https://raw.githubusercontent.com/asadny82/Data607/refs/heads/main/Week%209_TidyVerse_GitHub_CREATE%20assignment.csv"
)
#> Rows: 173 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (2): Major, Major_category
#> dbl (9): Major_code, Total, Employed, Employed_full_time_year_round, Unemplo...
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the first rows of the freshly loaded data.
print(dataset)
#> # A tibble: 173 × 11
#> Major_code Major Major_category Total Employed Employed_full_time_y…¹
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 1100 GENERAL AGR… Agriculture &… 128148 90245 74078
#> 2 1101 AGRICULTURE… Agriculture &… 95326 76865 64240
#> 3 1102 AGRICULTURA… Agriculture &… 33955 26321 22810
#> 4 1103 ANIMAL SCIE… Agriculture &… 103549 81177 64937
#> 5 1104 FOOD SCIENCE Agriculture &… 24280 17281 12722
#> 6 1105 PLANT SCIEN… Agriculture &… 79409 63043 51077
#> 7 1106 SOIL SCIENCE Agriculture &… 6586 4926 4042
#> 8 1199 MISCELLANEO… Agriculture &… 8549 6392 5074
#> 9 1301 ENVIRONMENT… Biology & Lif… 106106 87602 65238
#> 10 1302 FORESTRY Agriculture &… 69447 48228 39613
#> # ℹ 163 more rows
#> # ℹ abbreviated name: ¹Employed_full_time_year_round
#> # ℹ 5 more variables: Unemployed <dbl>, Unemployment_rate <dbl>, Median <dbl>,
#> # P25th <dbl>, P75th <dbl>
The `as_tibble()` function presents the dataset as a tibble, which prints as a compact, readable table.
# Display the data in tibble form.
dataset |>
  as_tibble()
#> # A tibble: 173 × 11
#> Major_code Major Major_category Total Employed Employed_full_time_y…¹
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 1100 GENERAL AGR… Agriculture &… 128148 90245 74078
#> 2 1101 AGRICULTURE… Agriculture &… 95326 76865 64240
#> 3 1102 AGRICULTURA… Agriculture &… 33955 26321 22810
#> 4 1103 ANIMAL SCIE… Agriculture &… 103549 81177 64937
#> 5 1104 FOOD SCIENCE Agriculture &… 24280 17281 12722
#> 6 1105 PLANT SCIEN… Agriculture &… 79409 63043 51077
#> 7 1106 SOIL SCIENCE Agriculture &… 6586 4926 4042
#> 8 1199 MISCELLANEO… Agriculture &… 8549 6392 5074
#> 9 1301 ENVIRONMENT… Biology & Lif… 106106 87602 65238
#> 10 1302 FORESTRY Agriculture &… 69447 48228 39613
#> # ℹ 163 more rows
#> # ℹ abbreviated name: ¹Employed_full_time_year_round
#> # ℹ 5 more variables: Unemployed <dbl>, Unemployment_rate <dbl>, Median <dbl>,
#> # P25th <dbl>, P75th <dbl>
# Transposed overview: one line per column with its type and first values.
dataset |>
  glimpse()
#> Rows: 173
#> Columns: 11
#> $ Major_code <dbl> 1100, 1101, 1102, 1103, 1104, 1105, 1106…
#> $ Major <chr> "GENERAL AGRICULTURE", "AGRICULTURE PROD…
#> $ Major_category <chr> "Agriculture & Natural Resources", "Agri…
#> $ Total <dbl> 128148, 95326, 33955, 103549, 24280, 794…
#> $ Employed <dbl> 90245, 76865, 26321, 81177, 17281, 63043…
#> $ Employed_full_time_year_round <dbl> 74078, 64240, 22810, 64937, 12722, 51077…
#> $ Unemployed <dbl> 2423, 2266, 821, 3619, 894, 2070, 264, 2…
#> $ Unemployment_rate <dbl> 0.02614711, 0.02863606, 0.03024832, 0.04…
#> $ Median <dbl> 50000, 54000, 63000, 46000, 62000, 50000…
#> $ P25th <dbl> 34000, 36000, 40000, 30000, 38500, 35000…
#> $ P75th <dbl> 80000, 80000, 98000, 72000, 90000, 75000…
# List the column names available for selection.
names(dataset)
#> [1] "Major_code" "Major"
#> [3] "Major_category" "Total"
#> [5] "Employed" "Employed_full_time_year_round"
#> [7] "Unemployed" "Unemployment_rate"
#> [9] "Median" "P25th"
#> [11] "P75th"
# Keep only the columns used in the rest of the analysis.
dataset <- select(
  dataset,
  Major,
  Unemployed,
  Employed_full_time_year_round,
  Unemployment_rate,
  Major_category
)
tibble(dataset)
#> # A tibble: 173 × 5
#> Major Unemployed Employed_full_time_y…¹ Unemployment_rate Major_category
#> <chr> <dbl> <dbl> <dbl> <chr>
#> 1 GENERAL A… 2423 74078 0.0261 Agriculture &…
#> 2 AGRICULTU… 2266 64240 0.0286 Agriculture &…
#> 3 AGRICULTU… 821 22810 0.0302 Agriculture &…
#> 4 ANIMAL SC… 3619 64937 0.0427 Agriculture &…
#> 5 FOOD SCIE… 894 12722 0.0492 Agriculture &…
#> 6 PLANT SCI… 2070 51077 0.0318 Agriculture &…
#> 7 SOIL SCIE… 264 4042 0.0509 Agriculture &…
#> 8 MISCELLAN… 261 5074 0.0392 Agriculture &…
#> 9 ENVIRONME… 4736 65238 0.0513 Biology & Lif…
#> 10 FORESTRY 2144 39613 0.0426 Agriculture &…
#> # ℹ 163 more rows
#> # ℹ abbreviated name: ¹Employed_full_time_year_round
# Summarise the mean unemployment rate for each major category.
# Grouping by Major_category (rather than by Unemployment_rate itself) is what
# makes this an aggregation: grouping on the value being averaged produces one
# group per distinct rate, so every "mean" simply equals its own group key.
# na.rm = TRUE keeps a single missing rate from turning a category mean into NA.
dataset1 <- dataset |>
  group_by(Major_category) |>
  summarise(mean_Unemployment_rate = mean(Unemployment_rate, na.rm = TRUE))
dataset1
# Top 20 majors ranked by unemployment rate.
# Stored under its own name so the full `dataset` is NOT overwritten; the
# employment ranking further down must be computed over all majors, not over
# this 20-row subset.
top_unemployment <- dataset |>
  arrange(desc(Unemployment_rate)) |>
  slice_head(n = 20)

# Horizontal bar chart of the 20 highest unemployment rates.
ggplot(
  top_unemployment,
  aes(x = reorder(Major, Unemployment_rate), y = Unemployment_rate)
) +
  geom_col(fill = "red") +
  labs(
    title = "Top 20 College Majors with \n highest Unemployment_rate",
    x = "Major",
    y = "Unemployment Rate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
  coord_flip()

# Plot of the Unemployment_rate by Major_category for the same 20 majors.
# The bar length is the (numeric) unemployment rate and the fill colour carries
# the category; mapping the categorical Major_category to the bar height does
# not produce a meaningful bar chart.
ggplot(
  top_unemployment,
  aes(
    x = reorder(Major, Unemployment_rate),
    y = Unemployment_rate,
    fill = Major_category
  )
) +
  geom_col() +
  labs(
    title = "Major_category of Majors",
    x = "Major",
    y = "Unemployment Rate",
    fill = "Major_category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 70, vjust = 0.5)) +
  coord_flip()

# Top 20 majors by Employed_full_time_year_round, ranked across ALL majors.
# Original author's note: BUSINESS MANAGEMENT AND ADMINISTRATION is the top.
dataset2 <- dataset |>
  arrange(desc(Employed_full_time_year_round)) |>
  slice_head(n = 20)

ggplot(
  dataset2,
  aes(
    x = reorder(Major, Employed_full_time_year_round),
    y = Employed_full_time_year_round
  )
) +
  geom_col(fill = "green", width = 0.5) +
  labs(
    title = "Top 20 Number of Employed_full_time_year_round",
    x = "Major",
    y = "Employed_full_time_year_round"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 50, vjust = 2, hjust = 1)) +
  coord_flip()