Use the read.csv() function to read the data from the
gates_donations.csv file (available on Canvas, make sure
you save it in the project repository folder) into a variable called
grants
## If you save it in the same folder as this file, this should work.
## But you may need to edit the string to reflect the actual file name:
grants <- read_csv("gates_donations.csv")
## Rows: 255 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): grant_title, organization, group, Grant start date
## dbl (5): id, total_amount, start_month, start_day, start_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
grants %>% head()
## # A tibble: 6 × 9
## grant_title id organization total_amount group `Grant start date`
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 New Mexico Business … 1 New Mexico … 5000 low 2/4/10
## 2 LA NSC Match 2 Trustees of… 27727 low 8/3/09
## 3 Mathematics Assessme… 3 Denver Scho… 36018 low 11/12/09
## 4 Convening of Stakeho… 4 The NEA Fou… 38420 low 3/11/10
## 5 Conference Support 5 New Schools… 50000 low 10/12/09
## 6 Conference Support G… 6 Battelle Fo… 50000 low 6/30/09
## # ℹ 3 more variables: start_month <dbl>, start_day <dbl>, start_year <dbl>
#head(grants)
Because you’re in an RProject, you don’t need to set your working
directory in RStudio! The working directory is automatically set to the
repository location. (You can use getwd() to confirm
this)
grants %>% glimpse()
## Rows: 255
## Columns: 9
## $ grant_title <chr> "New Mexico Business Roundtable", "LA NSC Match", "…
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ organization <chr> "New Mexico Business Roundtable for Educational Exc…
## $ total_amount <dbl> 5000, 27727, 36018, 38420, 50000, 50000, 50000, 515…
## $ group <chr> "low", "low", "low", "low", "low", "low", "low", "l…
## $ `Grant start date` <chr> "2/4/10", "8/3/09", "11/12/09", "3/11/10", "10/12/0…
## $ start_month <dbl> 2, 8, 11, 3, 10, 6, 9, 10, 11, 10, 11, 11, 11, 11, …
## $ start_day <dbl> 4, 3, 12, 11, 12, 30, 24, 29, 16, 28, 2, 2, 2, 2, 2…
## $ start_year <dbl> 2010, 2009, 2009, 2010, 2009, 2009, 2009, 2009, 200…
#glimpse(grants)
Create a variable org that contains the
organization column of the dataset
org <- grants$organization
#glimpse(org)
#print(org)
Confirm that org is a vector using the is.vector() function. (This is a
useful debugging tip if you hit errors later!)
#print(is.vector(org))
org %>% is.vector()
## [1] TRUE
Now you can ask some interesting questions about the dataset:
# Mean grant amount across all rows of `grants`.
# Base-R form: mean_grant <- mean(grants$total_amount)
mean_grant <- grants$total_amount %>% mean()
print(mean_grant)
## [1] 2600197
# Largest single grant amount.
# Base-R form: highest_grant <- max(grants$total_amount)
highest_grant <- grants$total_amount %>% max()
print(highest_grant)
## [1] 1e+08
# Smallest single grant amount. The base-R form is kept as a comment, like
# the two above, instead of being executed and then immediately overwritten.
# Base-R form: smallest_grant <- min(grants$total_amount)
smallest_grant <- grants$total_amount %>% min()
print(smallest_grant)
## [1] 5000
# Row(s) of `grants` whose amount equals the overall maximum; print the
# organization name(s).
# First-match base-R form: grants$organization[which.max(grants$total_amount)]
largest_org <- grants %>%
  filter(total_amount == max(total_amount))
print(largest_org$organization)
## [1] "Hillsborough County Public Schools"
# Row(s) of `grants` whose amount equals the overall minimum; print the
# organization name(s).
# First-match base-R form: grants$organization[which.min(grants$total_amount)]
smallest_org <- grants %>%
  filter(total_amount == min(total_amount))
print(smallest_org$organization)
## [1] "New Mexico Business Roundtable for Educational Excellence"
#glimpse(grants)
# Number of grants whose start year is 2010.
filter(grants, start_year == 2010) %>% nrow()
## [1] 18
library(dplyr)
# Grants that started in 2010. Inside filter() the column is referenced by its
# bare name: `grants$start_year` would read the global table rather than the
# data flowing through the pipe, which breaks as soon as an earlier step
# changes the rows or groups the data.
grant_2010 <- grants %>%
  filter(start_year == 2010)
# Total amount granted in 2010.
# Base-R form: total_2010 <- sum(grant_2010$total_amount)
total_2010 <- grant_2010$total_amount %>% sum()
print(total_2010)
## [1] 29196031
Load R’s “USPersonalExpenditure” dataset using the
data() function
This will produce a data frame called
USPersonalExpenditure
data("USPersonalExpenditure")
#head(USPersonalExpenditure)
USPersonalExpenditure %>% head()
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
#is.data.frame(USPersonalExpenditure)
USPersonalExpenditure %>% is.data.frame()
## [1] FALSE
The variable USPersonalExpenditure is now accessible to
you. Unfortunately, it’s not a data frame (it’s actually a matrix; you
can find that out using the is.matrix() function).
Test this using the is.data.frame() function:
is.data.frame(USPersonalExpenditure)
## [1] FALSE
Luckily, you can pass the USPersonalExpenditure variable as an
argument to the data.frame() function to convert it to a data
frame.
Do this, storing the result in a new variable
US_Expenditure <- data.frame(USPersonalExpenditure)
head(US_Expenditure)
## X1940 X1945 X1950 X1955 X1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
is.data.frame(US_Expenditure)
## [1] TRUE
What are the current column names of your dataframe?
colnames(US_Expenditure)
## [1] "X1940" "X1945" "X1950" "X1955" "X1960"
Consider: why are they so strange? (Think about whether you could use a number like 1940 with dollar notation!)
What are the current row names of your dataframe?
row.names(US_Expenditure)
## [1] "Food and Tobacco" "Household Operation" "Medical and Health"
## [4] "Personal Care" "Private Education"
Add a column “Category” to your data frame that contains the rownames
# Store the row names in a regular `Category` column, then show the result.
US_Expenditure$Category <- rownames(US_Expenditure)
head(US_Expenditure, n = 10)
## X1940 X1945 X1950 X1955 X1960 Category
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80 Food and Tobacco
## Household Operation 10.500 15.500 29.00 36.5 46.20 Household Operation
## Medical and Health 3.530 5.760 9.71 14.0 21.10 Medical and Health
## Personal Care 1.040 1.980 2.45 3.4 5.40 Personal Care
## Private Education 0.341 0.974 1.80 2.6 3.64 Private Education
Consider how this data isn’t tidy. Use the
pivot_longer() function from last class to make the data
tidy.
# Reshape to one row per (Category, Year). The year columns are selected by
# their "X" prefix rather than by position, so the call keeps working if the
# column order changes. The prefix is stripped (anchored at the start of the
# name) and the remainder converted to numeric as part of the pivot.
new_df <- pivot_longer(
  US_Expenditure,
  cols = starts_with("X"),
  names_to = "Year",
  names_prefix = "X",
  names_transform = list(Year = as.numeric),
  values_to = "Expense"
)
head(new_df, 100)
## # A tibble: 25 × 3
## Category Year Expense
## <chr> <dbl> <dbl>
## 1 Food and Tobacco 1940 22.2
## 2 Food and Tobacco 1945 44.5
## 3 Food and Tobacco 1950 59.6
## 4 Food and Tobacco 1955 73.2
## 5 Food and Tobacco 1960 86.8
## 6 Household Operation 1940 10.5
## 7 Household Operation 1945 15.5
## 8 Household Operation 1950 29
## 9 Household Operation 1955 36.5
## 10 Household Operation 1960 46.2
## # ℹ 15 more rows
How much money was spent on personal care in 1940?
# Row for the "Personal Care" category in 1940; print its expense.
personal_1940 <- new_df %>%
  filter(Category == "Personal Care", Year == 1940)
print(personal_1940$Expense)
## [1] 1.04
How much money was spent on Food and Tobacco in 1960?
# Row for the "Food and Tobacco" category in 1960; show its expense.
food_tobacco_1960 <- new_df %>%
  filter(Category == "Food and Tobacco", Year == 1960)
head(food_tobacco_1960$Expense)
## [1] 86.8
What was the highest expenditure category in 1960?
# Keep only the 1960 rows, then the row(s) with the largest expense.
spend_1960 <- filter(new_df, Year == 1960)
highest_spend_1960 <- filter(spend_1960, Expense == max(Expense))
# The question asks for the category, so report the category name rather
# than the amount spent.
print(highest_spend_1960$Category)
## [1] "Food and Tobacco"
Define a function lowest_category that takes in a year
as a parameter, and returns the lowest spending category of that
year
#' Lowest spending category for a given year
#'
#' @param input_year Year to look up; compared against the `Year` column.
#' @param data Long-format data frame with `Category`, `Year` and `Expense`
#'   columns. Defaults to the `new_df` object created earlier in this file,
#'   so existing one-argument calls keep working.
#' @return The `Category` value(s) with the smallest `Expense` in
#'   `input_year` (more than one on a tie); `character(0)` when the year has
#'   no rows.
lowest_category <- function(input_year, data = new_df) {
  # which() drops NA comparisons, the same way filter() discards them.
  year_rows <- data[which(data$Year == input_year), , drop = FALSE]
  if (nrow(year_rows) == 0) {
    # Avoid calling min() on an empty vector, which warns and returns Inf.
    return(character(0))
  }
  lowest <- min(year_rows$Expense)
  year_rows$Category[which(year_rows$Expense == lowest)]
}
lowest_category(1960)
## [1] "Private Education"
Using your function, determine the lowest spending category of each
year. Hint: use the sapply() function to apply your function
to a vector of years.
# Distinct values of the Year column of new_df.
list_year <- unique(new_df$Year)
# Call lowest_category() once per year; sapply() simplifies the per-year
# results into a single vector when each call returns exactly one value.
sapply(list_year, lowest_category)
## [1] "Private Education" "Private Education" "Private Education"
## [4] "Private Education" "Private Education"