Load the packages

#install.packages("tidyverse")
#install.packages("dslabs")  # these are data science labs
#install.packages("treemap")
#install.packages("RColorBrewer")
#install.packages("ggplot2")



library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dslabs)
library(treemap)
library(RColorBrewer)
library(ggplot2)

View the different datasets from dslabs

# List the datasets that ship with the dslabs package
data(package = "dslabs")

# List the files in the package's installed "script" directory
list.files(
  path = system.file("script", package = "dslabs")
)
##  [1] "make-admissions.R"                   
##  [2] "make-brca.R"                         
##  [3] "make-brexit_polls.R"                 
##  [4] "make-death_prob.R"                   
##  [5] "make-divorce_margarine.R"            
##  [6] "make-gapminder-rdas.R"               
##  [7] "make-greenhouse_gases.R"             
##  [8] "make-historic_co2.R"                 
##  [9] "make-mnist_27.R"                     
## [10] "make-movielens.R"                    
## [11] "make-murders-rda.R"                  
## [12] "make-na_example-rda.R"               
## [13] "make-nyc_regents_scores.R"           
## [14] "make-olive.R"                        
## [15] "make-outlier_example.R"              
## [16] "make-polls_2008.R"                   
## [17] "make-polls_us_election_2016.R"       
## [18] "make-reported_heights-rda.R"         
## [19] "make-research_funding_rates.R"       
## [20] "make-stars.R"                        
## [21] "make-temp_carbon.R"                  
## [22] "make-tissue-gene-expression.R"       
## [23] "make-trump_tweets.R"                 
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"

Choose which dataset to use

# Load the research_funding_rates dataset into the workspace
data("research_funding_rates")

# Write the dataset to a CSV file in the current working directory;
# missing values are written as empty strings rather than "NA"
write_csv(
  research_funding_rates,
  file = "research_funding_rates.csv",
  na = ""
)

View the first six rows of the dataset

# Preview the first six rows (head()'s default, spelled out here)
head(research_funding_rates, n = 6L)
##           discipline applications_total applications_men applications_women
## 1  Chemical sciences                122               83                 39
## 2  Physical sciences                174              135                 39
## 3            Physics                 76               67                  9
## 4         Humanities                396              230                166
## 5 Technical sciences                251              189                 62
## 6  Interdisciplinary                183              105                 78
##   awards_total awards_men awards_women success_rates_total success_rates_men
## 1           32         22           10                26.2              26.5
## 2           35         26            9                20.1              19.3
## 3           20         18            2                26.3              26.9
## 4           65         33           32                16.4              14.3
## 5           43         30           13                17.1              15.9
## 6           29         12           17                15.8              11.4
##   success_rates_women
## 1                25.6
## 2                23.1
## 3                22.2
## 4                19.3
## 5                21.0
## 6                21.8

Create a treemap of the dataset

 # Treemap with one tile per discipline: tile area comes from
 # applications_total (vSize) and tile color from success_rates_total (vColor).
 treemap(
   research_funding_rates,
   index = "discipline",
   vSize = "applications_total",
   vColor = "success_rates_total",
   type = "value",
   palette = "Pastel1",
   title = "Funding rates by discipline"
 )

As shown, the social sciences have the largest section because that discipline received the most applications. Physics had the highest funding rate but the lowest number of applications.

Create a scatter plot of the dataset

# Scatter plot of overall success rate against total number of applications,
# with one point per row of the dataset, colored by discipline.
funding_plot <- research_funding_rates %>%
  ggplot(
    aes(
      x = applications_total,
      y = success_rates_total,
      color = discipline
    )
  ) +
  geom_point() +
  labs(
    x = "Number of applications",
    y = "Application success rate",
    title = "Funding rate by discipline"
  )

# Display the plot with the dark theme applied
funding_plot + theme_dark()

This scatter plot shows the application success rate against the number of applications, colored by discipline. It makes sense that disciplines receiving more applications tend to have lower success rates.