act_start <- us %>%
select(subagency_name, `_id`, activity_description, fiscal_year, activity_start_date) %>%
group_by(subagency_name, `_id`,activity_description,fiscal_year, activity_start_date) %>%
filter(is.na(activity_start_date)) %>%
summarise(count = n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'subagency_name', '_id',
## 'activity_description', 'fiscal_year'. You can override using the `.groups`
## argument.
mytable <- xtabs(~fiscal_year + aid_type_group_name, data = us)
#to drop a column
us$subagency_acronym <- NULL
dim(us)
## [1] 8297 21
library(readr)
library(tidyverse)
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
library(Kendall)
## Warning: package 'Kendall' was built under R version 4.2.3
library(vroom)
## Warning: package 'vroom' was built under R version 4.2.3
##
## Attaching package: 'vroom'
##
## The following objects are masked from 'package:readr':
##
## as.col_spec, col_character, col_date, col_datetime, col_double,
## col_factor, col_guess, col_integer, col_logical, col_number,
## col_skip, col_time, cols, cols_condense, cols_only, date_names,
## date_names_lang, date_names_langs, default_locale, fwf_cols,
## fwf_empty, fwf_positions, fwf_widths, locale, output_column,
## problems, spec
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
us2 <- clean_names(us)
introduce(us)
## # A tibble: 1 × 9
## rows columns discrete_columns continuous_columns all_missing_columns
## <int> <int> <int> <int> <int>
## 1 8297 21 18 3 0
## # ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
## # total_observations <int>, memory_usage <dbl>
plot_missing(data = us)
plot_bar(data = us)
## 7 columns ignored with more than 50 categories.
## channel_name: 365 categories
## dac_purpose_name: 92 categories
## funding_account_name: 58 categories
## activity_name: 1166 categories
## activity_start_date: 303 categories
## activity_end_date: 245 categories
## activity_description: 1378 categories
colSums(us == "")
## _id implementing_agency_acronym
## 0 0
## implementing_agency_name subagency_name
## 0 0
## channel_category_name channel_subcategory_name
## 0 0
## channel_name dac_category_name
## 0 0
## dac_sector_name dac_purpose_name
## 0 0
## funding_account_name assistance_category_name
## 0 0
## activity_name activity_start_date
## 0 NA
## activity_end_date fiscal_year
## NA 0
## current_amount funding_agency_name
## 0 0
## funding_agency_acronym aid_type_group_name
## 0 0
## activity_description
## 0
missing_vars <- function(x) {
var <- 0
missing <- 0
missing_prop <- 0
for (i in 1:length(names(x))) {
var[i] <- names(x)[i]
missing[i] <- sum(is.na(x[, i]))
missing_prop[i] <- missing[i] / nrow(x)
}
(missing_data <- data.frame(var = var, missing = missing, missing_prop = missing_prop) %>%
arrange(desc(missing_prop)))
}
missing_vars(us2)
## var missing missing_prop
## 1 activity_start_date 5966 0.7190551
## 2 activity_end_date 5874 0.7079667
## 3 id 0 0.0000000
## 4 implementing_agency_acronym 0 0.0000000
## 5 implementing_agency_name 0 0.0000000
## 6 subagency_name 0 0.0000000
## 7 channel_category_name 0 0.0000000
## 8 channel_subcategory_name 0 0.0000000
## 9 channel_name 0 0.0000000
## 10 dac_category_name 0 0.0000000
## 11 dac_sector_name 0 0.0000000
## 12 dac_purpose_name 0 0.0000000
## 13 funding_account_name 0 0.0000000
## 14 assistance_category_name 0 0.0000000
## 15 activity_name 0 0.0000000
## 16 fiscal_year 0 0.0000000
## 17 current_amount 0 0.0000000
## 18 funding_agency_name 0 0.0000000
## 19 funding_agency_acronym 0 0.0000000
## 20 aid_type_group_name 0 0.0000000
## 21 activity_description 0 0.0000000
#two variables have 71 and 70% missing data
prop.table(table(us2$subagency_name))
##
## Agricultural Research Service
## 0.0007231529
## Animal, Plant and Health Inspection Service
## 0.0016873569
## Army Corp of Engineers
## 0.0003615765
## Bureau for Asia
## 0.0442328552
## Bureau for Asia and Near East
## 0.0007231529
## Bureau for Democracy, Conflict and Humanitarian Assistance
## 0.1447511149
## Bureau for Economic Growth, Agriculture and Trade
## 0.0010847294
## Bureau for Europe and Eurasia
## 0.0001205255
## Bureau for Global Health
## 0.0042183922
## Bureau for Middle East
## 0.0015668314
## Bureau for Policy, Planning, and Learning
## 0.0009642039
## Bureau of International Labor Affairs
## 0.0008436784
## Centers for Disease Control and Prevention
## 0.0138604315
## Counterterrorism Coordinator
## 0.0036157647
## Defense Security Cooperation Agency
## 0.0138604315
## Defense Threat Reduction Agency
## 0.0006026275
## Democracy, Human Rights, and Labor
## 0.0040978667
## Diplomatic Security
## 0.0021694588
## Drug Enforcement Administration
## 0.0003615765
## Educational and Cultural Affairs
## 0.0021694588
## Federal Bureau of Investigation
## 0.0010847294
## Food Safety Inspection Service
## 0.0001205255
## Foreign Agricultural Service
## 0.0081957334
## Global Development Lab
## 0.0001205255
## International Narcotics and Law Enforcement Affairs
## 0.0048210196
## International Security and Nonproliferation
## 0.0047004942
## International Trade Administration
## 0.0002410510
## National Endowment for Democracy
## 0.0451970592
## National Nuclear Security Administration
## 0.0038568157
## National Oceanic and Atmospheric Administration
## 0.0001205255
## not applicable
## 0.0128962276
## Office of Afghanistan and Pakistan Affairs
## 0.5723755574
## Office of Counternarcotics
## 0.0020489333
## Office of Development Partner
## 0.0001205255
## Office of General Counsel
## 0.0006026275
## Office of Technical Assistance
## 0.0013257804
## Office of the Secretary of Defense
## 0.0007231529
## Political-Military Affairs
## 0.0001205255
## Population, Refugees, and Migration
## 0.0597806436
## South and Central Asian Affairs
## 0.0100036158
## Trafficking in Persons
## 0.0010847294
## U.S. Customs and Border Protection
## 0.0004821020
## U.S. Fish and Wildlife Service
## 0.0008436784
## U.S. Immigration and Customs Enforcement
## 0.0003615765
## USAID Bureau - Other
## 0.0267566590
clean_names(us)
## # A tibble: 8,297 × 21
## id implementing_agency_acronym implementing_agency_name subagency_name
## <dbl> <chr> <chr> <chr>
## 1 1 USAID U.S. Agency for Internation… Bureau for As…
## 2 2 DOD Department of Defense Office of the…
## 3 3 DOD Department of Defense Office of the…
## 4 4 DOD Department of Defense Defense Secur…
## 5 5 DOD Department of Defense Defense Secur…
## 6 6 DOD Department of Defense Defense Secur…
## 7 7 DOD Department of Defense Defense Secur…
## 8 8 DOD Department of Defense Defense Secur…
## 9 9 DOD Department of Defense Defense Secur…
## 10 10 DOD Department of Defense Defense Secur…
## # ℹ 8,287 more rows
## # ℹ 17 more variables: channel_category_name <chr>,
## # channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, activity_start_date <date>, activity_end_date <date>,
## # fiscal_year <dbl>, current_amount <dbl>, funding_agency_name <chr>, …
head(us)
## # A tibble: 6 × 21
## `_id` implementing_agency_acronym implementing_agency_name subagency_name
## <dbl> <chr> <chr> <chr>
## 1 1 USAID U.S. Agency for Internationa… Bureau for As…
## 2 2 DOD Department of Defense Office of the…
## 3 3 DOD Department of Defense Office of the…
## 4 4 DOD Department of Defense Defense Secur…
## 5 5 DOD Department of Defense Defense Secur…
## 6 6 DOD Department of Defense Defense Secur…
## # ℹ 17 more variables: channel_category_name <chr>,
## # channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, activity_start_date <date>, activity_end_date <date>,
## # fiscal_year <dbl>, current_amount <dbl>, funding_agency_name <chr>,
## # funding_agency_acronym <chr>, aid_type_group_name <chr>, …
#we will split the data into having activity dates and without dates
#dataset without dates
usw <- us
#remove vairables form data set which are duplicate
usw$activity_end_date <- NULL
usw$activity_start_date <- NULL
usw$implementing_agency_acronym <- NULL
usw$implementing_agency_acronym <- NULL
usw$funding_agency_acronym <- NULL
dim(usw)
## [1] 8297 17
#we have two data sets us with dates usw without dates variables
#usw has no missing data
missmap(usw)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
psych::describe(usw)
## vars n mean sd median trimmed
## _id 1 8297 4149.00 2395.28 4149 4149.00
## implementing_agency_name* 2 8297 14.44 3.38 16 15.21
## subagency_name* 3 8297 26.53 11.52 32 27.44
## channel_category_name* 4 8297 3.24 1.36 3 3.13
## channel_subcategory_name* 5 8297 6.93 3.87 6 6.60
## channel_name* 6 8297 198.46 103.87 215 200.01
## dac_category_name* 7 8297 5.19 2.76 6 5.25
## dac_sector_name* 8 8297 14.10 7.55 14 14.00
## dac_purpose_name* 9 8297 46.13 24.58 55 47.34
## funding_account_name* 10 8297 25.81 14.41 35 26.08
## assistance_category_name* 11 8297 1.02 0.13 1 1.00
## activity_name* 12 8297 624.21 399.15 597 635.35
## fiscal_year 13 8297 2012.77 4.53 2013 2012.91
## current_amount 14 8297 2027410.46 16522468.05 85000 357284.73
## funding_agency_name* 15 8297 12.88 4.00 15 13.77
## aid_type_group_name* 16 8297 4.11 1.75 5 4.28
## activity_description* 17 8297 556.09 421.19 526 538.16
## mad min max range skew
## _id 3074.91 1 8297 8296 0.00
## implementing_agency_name* 0.00 1 16 15 -2.08
## subagency_name* 0.00 1 45 44 -0.98
## channel_category_name* 1.48 1 7 6 0.71
## channel_subcategory_name* 4.45 1 16 15 0.68
## channel_name* 152.71 1 365 364 -0.05
## dac_category_name* 2.97 1 10 9 -0.39
## dac_sector_name* 10.38 1 30 29 0.05
## dac_purpose_name* 20.76 1 92 91 -0.43
## funding_account_name* 8.90 1 58 57 -0.25
## assistance_category_name* 0.00 1 2 1 7.28
## activity_name* 557.46 1 1166 1165 -0.07
## fiscal_year 4.45 2001 2021 20 -0.24
## current_amount 125223.36 -84169377 600000000 684169377 19.99
## funding_agency_name* 0.00 1 15 14 -1.69
## aid_type_group_name* 0.00 1 6 5 -1.10
## activity_description* 518.91 1 1378 1377 0.38
## kurtosis se
## _id -1.20 26.30
## implementing_agency_name* 3.46 0.04
## subagency_name* -0.41 0.13
## channel_category_name* -0.43 0.01
## channel_subcategory_name* -1.00 0.04
## channel_name* -1.32 1.14
## dac_category_name* -1.13 0.03
## dac_sector_name* -0.91 0.08
## dac_purpose_name* -0.99 0.27
## funding_account_name* -1.02 0.16
## assistance_category_name* 51.07 0.00
## activity_name* -1.41 4.38
## fiscal_year -0.56 0.05
## current_amount 495.47 181390.43
## funding_agency_name* 1.63 0.04
## aid_type_group_name* -0.58 0.02
## activity_description* -1.11 4.62
head(usw)
## # A tibble: 6 × 17
## `_id` implementing_agency_name subagency_name channel_category_name
## <dbl> <chr> <chr> <chr>
## 1 1 U.S. Agency for International Deve… Bureau for As… Government
## 2 2 Department of Defense Office of the… Government
## 3 3 Department of Defense Office of the… Government
## 4 4 Department of Defense Defense Secur… Government
## 5 5 Department of Defense Defense Secur… Government
## 6 6 Department of Defense Defense Secur… Government
## # ℹ 13 more variables: channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, fiscal_year <dbl>, current_amount <dbl>,
## # funding_agency_name <chr>, aid_type_group_name <chr>,
## # activity_description <chr>
us[c('channel_nature', 'channel_source')] <- str_split_fixed(us$channel_subcategory_name, '-', 2)
usw[c('channel_nature', 'channel_source')] <- str_split_fixed(us$channel_subcategory_name, '-', 2)
head(usw)
## # A tibble: 6 × 19
## `_id` implementing_agency_name subagency_name channel_category_name
## <dbl> <chr> <chr> <chr>
## 1 1 U.S. Agency for International Deve… Bureau for As… Government
## 2 2 Department of Defense Office of the… Government
## 3 3 Department of Defense Office of the… Government
## 4 4 Department of Defense Defense Secur… Government
## 5 5 Department of Defense Defense Secur… Government
## 6 6 Department of Defense Defense Secur… Government
## # ℹ 15 more variables: channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, fiscal_year <dbl>, current_amount <dbl>,
## # funding_agency_name <chr>, aid_type_group_name <chr>,
## # activity_description <chr>, channel_nature <chr>, channel_source <chr>
range(usw$current_amount)
## [1] -84169377 600000000
#EDA of data with start and end activity dates Since dataset is analyzed in two sets one with dates information (us) and another without dates (usw)
head(us)
## # A tibble: 6 × 23
## `_id` implementing_agency_acronym implementing_agency_name subagency_name
## <dbl> <chr> <chr> <chr>
## 1 1 USAID U.S. Agency for Internationa… Bureau for As…
## 2 2 DOD Department of Defense Office of the…
## 3 3 DOD Department of Defense Office of the…
## 4 4 DOD Department of Defense Defense Secur…
## 5 5 DOD Department of Defense Defense Secur…
## 6 6 DOD Department of Defense Defense Secur…
## # ℹ 19 more variables: channel_category_name <chr>,
## # channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, activity_start_date <date>, activity_end_date <date>,
## # fiscal_year <dbl>, current_amount <dbl>, funding_agency_name <chr>,
## # funding_agency_acronym <chr>, aid_type_group_name <chr>, …
colnames(us)
## [1] "_id" "implementing_agency_acronym"
## [3] "implementing_agency_name" "subagency_name"
## [5] "channel_category_name" "channel_subcategory_name"
## [7] "channel_name" "dac_category_name"
## [9] "dac_sector_name" "dac_purpose_name"
## [11] "funding_account_name" "assistance_category_name"
## [13] "activity_name" "activity_start_date"
## [15] "activity_end_date" "fiscal_year"
## [17] "current_amount" "funding_agency_name"
## [19] "funding_agency_acronym" "aid_type_group_name"
## [21] "activity_description" "channel_nature"
## [23] "channel_source"
us <- clean_names(us)
colSums(is.na(us))
## id implementing_agency_acronym
## 0 0
## implementing_agency_name subagency_name
## 0 0
## channel_category_name channel_subcategory_name
## 0 0
## channel_name dac_category_name
## 0 0
## dac_sector_name dac_purpose_name
## 0 0
## funding_account_name assistance_category_name
## 0 0
## activity_name activity_start_date
## 0 5966
## activity_end_date fiscal_year
## 5874 0
## current_amount funding_agency_name
## 0 0
## funding_agency_acronym aid_type_group_name
## 0 0
## activity_description channel_nature
## 0 0
## channel_source
## 0
us$funding_agency_acronym <- NULL
us$implementing_agency_acronym <- NULL
us$fiscal_year <- as.factor(us$fiscal_year)
#Information of number of projects with missing date information in both assistance categories
nostart_noend_numbers <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(is.na(activity_start_date) & is.na(activity_end_date)) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(fct_reorder(fiscal_year, count), count, label = count)) +
geom_bar(stat = "identity", fill = "blue")+ geom_text(hjust = 0.4)+
facet_grid(~assistance_category_name)+
theme(legend.position = "none")+
labs(
title = "Number of projects with no start date information",
subtitle = "Projects in two assistance groups", x = "Fiscal Years",
y = "Number of Projects"
)+coord_flip()
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
nostart_noend_numbers
start_noend_numbers <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(!is.na(activity_start_date) & is.na(activity_end_date)) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(fct_reorder(fiscal_year, count), count, label = count)) + geom_text(hjust = -40) +
geom_bar(stat = "identity", fill = "yellow")+
facet_grid(~assistance_category_name)+
theme(legend.position = "none")+
labs(
title = "Number of projects with no end date information",
subtitle = "Projects in two assistance groups", x = "Fiscal Years",
y = "Number of Projects"
)+
coord_flip()
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
start_noend_numbers
start_end_numbers <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(!is.na(activity_start_date) & !is.na(activity_end_date)) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(fct_reorder(fiscal_year, count), count, label = count)) + geom_text(hjust = -10) +
geom_bar(stat = "identity", fill = "green")+
facet_grid(~assistance_category_name)+
theme(legend.position = "none")+
labs(
title = "Number of projects with completion information",
subtitle = "Projects in two assistance groups", x = "Fiscal Years",
y = "Number of Projects"
)+
coord_flip()
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
start_end_numbers
Variables with acronymns are deleted since they add no new information to dataset.
start <- us %>% select(id, implementing_agency_name,subagency_name,channel_category_name,channel_subcategory_name, channel_name, dac_category_name, dac_sector_name, dac_purpose_name,funding_account_name, assistance_category_name,activity_name, activity_start_date, activity_end_date, fiscal_year, current_amount, funding_agency_name, aid_type_group_name, activity_description) %>% group_by(activity_end_date, activity_start_date) %>% filter(is.na(activity_start_date)) %>%
summarise(count =n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'activity_end_date'. You can override using
## the `.groups` argument.
start
## # A tibble: 39 × 3
## # Groups: activity_end_date [39]
## activity_end_date activity_start_date count
## <date> <date> <int>
## 1 NA NA 5846
## 2 2023-09-30 NA 18
## 3 2020-09-30 NA 14
## 4 2023-06-30 NA 8
## 5 2018-12-31 NA 5
## 6 2021-06-20 NA 5
## 7 2022-04-24 NA 5
## 8 2023-04-30 NA 5
## 9 2020-03-15 NA 4
## 10 2023-07-31 NA 4
## # ℹ 29 more rows
end <- us %>% select(id, implementing_agency_name,subagency_name,channel_category_name,channel_subcategory_name, channel_name, dac_category_name, dac_sector_name, dac_purpose_name,funding_account_name, assistance_category_name,activity_name, activity_start_date, activity_end_date, fiscal_year, current_amount, funding_agency_name, aid_type_group_name, activity_description) %>% group_by(activity_end_date, activity_start_date) %>% filter(is.na(activity_end_date)) %>%
summarise(count =n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'activity_end_date'. You can override using
## the `.groups` argument.
end
## # A tibble: 19 × 3
## # Groups: activity_end_date [1]
## activity_end_date activity_start_date count
## <date> <date> <int>
## 1 NA NA 5846
## 2 NA 2009-10-01 4
## 3 NA 2012-09-27 4
## 4 NA 2010-03-01 2
## 5 NA 2010-07-08 2
## 6 NA 2010-07-23 2
## 7 NA 2012-08-30 2
## 8 NA 2007-09-05 1
## 9 NA 2009-07-13 1
## 10 NA 2010-08-01 1
## 11 NA 2010-08-26 1
## 12 NA 2010-09-14 1
## 13 NA 2010-09-23 1
## 14 NA 2011-10-01 1
## 15 NA 2012-05-24 1
## 16 NA 2012-06-12 1
## 17 NA 2012-06-28 1
## 18 NA 2014-04-17 1
## 19 NA 2015-10-28 1
both <- us %>% select(id, implementing_agency_name,subagency_name,channel_category_name,channel_subcategory_name, channel_name, dac_category_name, dac_sector_name, dac_purpose_name,funding_account_name, assistance_category_name,activity_name, activity_start_date, activity_end_date, fiscal_year, current_amount, funding_agency_name, aid_type_group_name, activity_description) %>% group_by(activity_end_date, activity_start_date) %>% filter(is.na(activity_end_date) | is.na(activity_start_date)) %>%
summarise(count =n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'activity_end_date'. You can override using
## the `.groups` argument.
both
## # A tibble: 57 × 3
## # Groups: activity_end_date [39]
## activity_end_date activity_start_date count
## <date> <date> <int>
## 1 NA NA 5846
## 2 2023-09-30 NA 18
## 3 2020-09-30 NA 14
## 4 2023-06-30 NA 8
## 5 2018-12-31 NA 5
## 6 2021-06-20 NA 5
## 7 2022-04-24 NA 5
## 8 2023-04-30 NA 5
## 9 2020-03-15 NA 4
## 10 2023-07-31 NA 4
## # ℹ 47 more rows
amount_nostart_butend <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(is.na(activity_start_date) & !is.na(activity_end_date)) %>%
summarise(max_amount = max(current_amount), min_amount = min(current_amount), avg_amount = mean(current_amount)) %>% ggplot(aes(fiscal_year, y = avg_amount))+geom_bar(stat = "identity", fill = "yellow")+
theme(legend.position = "none")+
labs(
title = "An average current amount per fiscal year",
subtitle = "Average amunt as per assistance category name", x = "Fiscal Years",
y = "Avg Current Amount"
)+
facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
amount_nostart_butend
amount_start_noend <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(!is.na(activity_start_date) & is.na(activity_end_date)) %>%
summarise(max_amount = max(current_amount), min_amount = min(current_amount), avg_amount = mean(current_amount)) %>% ggplot(aes(fiscal_year, y = avg_amount))+geom_bar(stat = "identity", fill = "green")+
theme(legend.position = "none")+
labs(
title = "An average current amount per fiscal year",
subtitle = "Average amunt as per assistance category name", x = "Fiscal Years",
y = "Avg Current Amount"
)+
facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
amount_start_noend
amount_start_end <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(!is.na(activity_start_date) & !is.na(activity_end_date)) %>%
summarise(max_amount = max(current_amount), min_amount = min(current_amount), avg_amount = mean(current_amount)) %>% ggplot(aes(fiscal_year, y = avg_amount))+geom_bar(stat = "identity", fill = "blue")+
theme(legend.position = "none")+
labs(
title = "An average current amount per fiscal year",
subtitle = "Average amunt as per assistance category name", x = "Fiscal Years",
y = "Avg Current Amount"
)+
facet_grid(~assistance_category_name)+ coord_flip()
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
amount_start_end
amount_nostart_noend <- us %>%
select(implementing_agency_name, subagency_name, channel_category_name, channel_subcategory_name, current_amount, fiscal_year,dac_sector_name,dac_purpose_name,funding_account_name,assistance_category_name, activity_name, funding_agency_name, aid_type_group_name, aid_type_group_name, channel_nature, channel_source, activity_start_date, activity_end_date ) %>%
group_by(fiscal_year,assistance_category_name,activity_start_date,activity_end_date) %>% filter(is.na(activity_start_date) & is.na(activity_end_date)) %>%
summarise(max_amount = max(current_amount), min_amount = min(current_amount), avg_amount = mean(current_amount)) %>% ggplot(aes(fiscal_year, y = avg_amount))+geom_bar(stat = "identity", fill = "red")+
theme(legend.position = "none")+
labs(
title = "An average current amount per fiscal year",
subtitle = "Average amunt as per assistance category name", x = "Fiscal Years",
y = "Avg Current Amount"
)+
facet_grid(~assistance_category_name)+coord_flip()
## `summarise()` has grouped output by 'fiscal_year', 'assistance_category_name',
## 'activity_start_date'. You can override using the `.groups` argument.
amount_nostart_noend
#EDA of data with dates
colnames(us)
## [1] "id" "implementing_agency_name"
## [3] "subagency_name" "channel_category_name"
## [5] "channel_subcategory_name" "channel_name"
## [7] "dac_category_name" "dac_sector_name"
## [9] "dac_purpose_name" "funding_account_name"
## [11] "assistance_category_name" "activity_name"
## [13] "activity_start_date" "activity_end_date"
## [15] "fiscal_year" "current_amount"
## [17] "funding_agency_name" "aid_type_group_name"
## [19] "activity_description" "channel_nature"
## [21] "channel_source"
clean_names(us)
## # A tibble: 8,297 × 21
## id implementing_agency_name subagency_name channel_category_name
## <dbl> <chr> <chr> <chr>
## 1 1 U.S. Agency for International Dev… Bureau for As… Government
## 2 2 Department of Defense Office of the… Government
## 3 3 Department of Defense Office of the… Government
## 4 4 Department of Defense Defense Secur… Government
## 5 5 Department of Defense Defense Secur… Government
## 6 6 Department of Defense Defense Secur… Government
## 7 7 Department of Defense Defense Secur… Government
## 8 8 Department of Defense Defense Secur… Government
## 9 9 Department of Defense Defense Secur… Government
## 10 10 Department of Defense Defense Secur… Government
## # ℹ 8,287 more rows
## # ℹ 17 more variables: channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, activity_start_date <date>, activity_end_date <date>,
## # fiscal_year <fct>, current_amount <dbl>, funding_agency_name <chr>,
## # aid_type_group_name <chr>, activity_description <chr>, …
us$implementing_agency_acronym <- NULL
us$subagency_acronym <- NULL
us$funding_agency_acronym <- NULL
us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date) %>% filter(!is.na(activity_start_date)) %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date'. You can override using the
## `.groups` argument.
## # A tibble: 400 × 5
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date [339]
## implementing_agency_name assistance_category_…¹ activity_start_date
## <chr> <chr> <date>
## 1 U.S. Agency for International Dev… Economic 2001-10-01
## 2 U.S. Agency for International Dev… Economic 2011-05-27
## 3 U.S. Agency for International Dev… Economic 2014-01-01
## 4 U.S. Agency for International Dev… Economic 2010-08-18
## 5 U.S. Agency for International Dev… Economic 2006-09-29
## 6 U.S. Agency for International Dev… Economic 2010-08-31
## 7 U.S. Agency for International Dev… Economic 2017-09-28
## 8 U.S. Agency for International Dev… Economic 2007-03-01
## 9 U.S. Agency for International Dev… Economic 2007-11-12
## 10 U.S. Agency for International Dev… Economic 2013-05-24
## # ℹ 390 more rows
## # ℹ abbreviated name: ¹​assistance_category_name
## # ℹ 2 more variables: activity_end_date <date>, count <int>
us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date) %>% filter(!is.na(activity_end_date) ) %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date'. You can override using the
## `.groups` argument.
## # A tibble: 427 × 5
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date [339]
## implementing_agency_name assistance_category_…¹ activity_start_date
## <chr> <chr> <date>
## 1 U.S. Agency for International Dev… Economic 2001-10-01
## 2 U.S. Agency for International Dev… Economic 2011-05-27
## 3 U.S. Agency for International Dev… Economic 2014-01-01
## 4 U.S. Agency for International Dev… Economic 2010-08-18
## 5 U.S. Agency for International Dev… Economic 2006-09-29
## 6 U.S. Agency for International Dev… Economic 2010-08-31
## 7 U.S. Agency for International Dev… Economic 2017-09-28
## 8 U.S. Agency for International Dev… Economic 2007-03-01
## 9 U.S. Agency for International Dev… Economic 2007-11-12
## 10 U.S. Agency for International Dev… Economic 2013-05-24
## # ℹ 417 more rows
## # ℹ abbreviated name: ¹​assistance_category_name
## # ℹ 2 more variables: activity_end_date <date>, count <int>
us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date) %>% filter(!is.na(activity_start_date) & !is.na(activity_end_date) ) %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date'. You can override using the
## `.groups` argument.
## # A tibble: 382 × 5
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date [329]
## implementing_agency_name assistance_category_…¹ activity_start_date
## <chr> <chr> <date>
## 1 U.S. Agency for International Dev… Economic 2001-10-01
## 2 U.S. Agency for International Dev… Economic 2011-05-27
## 3 U.S. Agency for International Dev… Economic 2014-01-01
## 4 U.S. Agency for International Dev… Economic 2010-08-18
## 5 U.S. Agency for International Dev… Economic 2006-09-29
## 6 U.S. Agency for International Dev… Economic 2010-08-31
## 7 U.S. Agency for International Dev… Economic 2017-09-28
## 8 U.S. Agency for International Dev… Economic 2007-03-01
## 9 U.S. Agency for International Dev… Economic 2007-11-12
## 10 U.S. Agency for International Dev… Economic 2013-05-24
## # ℹ 372 more rows
## # ℹ abbreviated name: ¹​assistance_category_name
## # ℹ 2 more variables: activity_end_date <date>, count <int>
us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date) %>% filter(!is.na(activity_start_date) | !is.na(activity_end_date) ) %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date'. You can override using the
## `.groups` argument.
## # A tibble: 445 × 5
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date [349]
## implementing_agency_name assistance_category_…¹ activity_start_date
## <chr> <chr> <date>
## 1 U.S. Agency for International Dev… Economic 2001-10-01
## 2 U.S. Agency for International Dev… Economic 2011-05-27
## 3 U.S. Agency for International Dev… Economic 2014-01-01
## 4 U.S. Agency for International Dev… Economic 2010-08-18
## 5 U.S. Agency for International Dev… Economic 2006-09-29
## 6 U.S. Agency for International Dev… Economic 2010-08-31
## 7 U.S. Agency for International Dev… Economic 2017-09-28
## 8 U.S. Agency for International Dev… Economic 2007-03-01
## 9 U.S. Agency for International Dev… Economic 2007-11-12
## 10 U.S. Agency for International Dev… Economic 2013-05-24
## # ℹ 435 more rows
## # ℹ abbreviated name: ¹​assistance_category_name
## # ℹ 2 more variables: activity_end_date <date>, count <int>
agency_dates_economic <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(!is.na(activity_start_date) & !is.na(activity_end_date) & assistance_category_name == "Economic") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
agency_dates_economic
## # A tibble: 386 × 6
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date, activity_end_date [374]
## implementing_agency_name assistance_category_…¹ activity_start_date
## <chr> <chr> <date>
## 1 U.S. Agency for International Dev… Economic 2011-05-27
## 2 U.S. Agency for International Dev… Economic 2014-01-01
## 3 U.S. Agency for International Dev… Economic 2001-10-01
## 4 U.S. Agency for International Dev… Economic 2010-08-18
## 5 U.S. Agency for International Dev… Economic 2006-09-29
## 6 U.S. Agency for International Dev… Economic 2010-08-31
## 7 U.S. Agency for International Dev… Economic 2017-09-28
## 8 U.S. Agency for International Dev… Economic 2007-03-01
## 9 U.S. Agency for International Dev… Economic 2007-11-12
## 10 U.S. Agency for International Dev… Economic 2013-05-24
## # ℹ 376 more rows
## # ℹ abbreviated name: ¹​assistance_category_name
## # ℹ 3 more variables: activity_end_date <date>, channel_name <chr>, count <int>
agency_dates_military <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(!is.na(activity_start_date) & !is.na(activity_end_date) & assistance_category_name == "Military") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
agency_dates_military
## # A tibble: 9 × 6
## # Groups: implementing_agency_name, assistance_category_name,
## # activity_start_date, activity_end_date [8]
## implementing_agency_name assistance_category_name activity_start_date
## <chr> <chr> <date>
## 1 Department of Defense Military 2014-01-15
## 2 Department of Defense Military 2009-01-01
## 3 Department of Defense Military 2014-01-15
## 4 Department of Defense Military 2017-10-01
## 5 Department of the Air Force Military 2015-10-01
## 6 Department of the Air Force Military 2016-10-01
## 7 Department of the Army Military 2015-10-01
## 8 Department of the Army Military 2017-10-01
## 9 Department of the Navy Military 2015-10-01
## # ℹ 3 more variables: activity_end_date <date>, channel_name <chr>, count <int>
agency_noend_economic <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date) %>% filter(!is.na(activity_start_date) & is.na(activity_end_date) & assistance_category_name == "Economic") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date'. You can override using the
## `.groups` argument.
noend_eco <- ggplot(agency_noend_economic, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "green") +
#geom_text(hjust = 0.2) +
coord_flip() +
theme(legend.position = "none") +
labs(
title = "The Number of Implementing agencies",
subtitle = "Implementing agencies in Economic sector with No end date",
#y = "Number of projects per year"
x = "Name of agencies"
#caption = "US aid program 2001-2023"
)
agency_nostart_economic <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(is.na(activity_start_date) & !is.na(activity_end_date) & assistance_category_name == "Economic") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
nostart_eco <- ggplot(agency_nostart_economic, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "blue") +
#geom_text(hjust = 0.2)+
coord_flip()+
theme(legend.position = "none")+
labs(
subtitle = "Implementing agencies in Economic sector with No start date",x = "Name of agencies"
#caption = "US aid program 2001-2023",
)
agency_noclue_economic <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(is.na(activity_start_date) & is.na(activity_end_date) & assistance_category_name == "Economic") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
nodate_eco <- ggplot(agency_noclue_economic, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "red") +
coord_flip()+
theme(legend.position = "none")+
labs(
subtitle = "Implementing agencies in Economic sector with No date info",
caption = "US aid program 2001-2023",y = "Number of projects per year", x = "Name of agencies"
)
plot_grid(noend_eco, nostart_eco,nodate_eco, nrow=3)
agency_noend_military <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(!is.na(activity_start_date) & is.na(activity_end_date) & assistance_category_name == "Military") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
noend_military <- ggplot(agency_noend_military, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "green") +
coord_flip()+
theme(legend.position = "none")+
labs(
title = "The Number of Implementing agencies",
subtitle = "Implementing agencies in Economic sector with No end date", x = "Name of agencies"
#y = "Number of projects per year"
)
agency_nostart_military <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(is.na(activity_start_date) & !is.na(activity_end_date) & assistance_category_name == "Military") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
nostart_military <- ggplot(agency_nostart_military, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "blue") +
coord_flip()+
theme(legend.position = "none")+
labs(
subtitle = "Implementing agencies in Economic sector with No start date",
#caption = "US aid program 2001-2023",
x = "Name of agencies"
)
agency_noclue_military <- us %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name, activity_start_date, activity_end_date) %>% group_by(implementing_agency_name,assistance_category_name,activity_start_date, activity_end_date,channel_name) %>% filter(is.na(activity_start_date) & is.na(activity_end_date) & assistance_category_name == "Military") %>%
summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name',
## 'assistance_category_name', 'activity_start_date', 'activity_end_date'. You can
## override using the `.groups` argument.
nodate_military <- ggplot(agency_noclue_military, aes(
x = fct_reorder(implementing_agency_name,count), count))+
geom_bar(stat = "identity", fill = "red") +
coord_flip()+
theme(legend.position = "none")+
labs(
subtitle = "Implementing agencies in Economic sector with No date info",
caption = "US aid program 2001-2023",y = "Number of projects per year", x = "Name of agencies"
)
plot_grid(noend_military, nostart_military,nodate_military, nrow=3)
#EDA of Data without dates
imple_proj_num <- usw %>% select(channel_name, channel_source, channel_nature, implementing_agency_name,aid_type_group_name,assistance_category_name) %>% group_by(implementing_agency_name,assistance_category_name) %>% summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'implementing_agency_name'. You can
## override using the `.groups` argument.
#number of implementing agencies
ggplot(imple_proj_num, aes(
x = fct_reorder(implementing_agency_name,count), count, label = count, fill = implementing_agency_name))+
geom_col() +
geom_text(hjust = 0.2)+
coord_flip()+
theme(legend.position = "none")+
labs(
title = "The Number of Implementing agencies",
subtitle = "Details about implementing agencies",
caption = "US aid program 2001-2023",
y = "Number of projects per year", x = "Name of agencies"
)+facet_wrap(~assistance_category_name)
agency_year <- xtabs(~ implementing_agency_name + fiscal_year , data = usw)
addmargins(agency_year)
## fiscal_year
## implementing_agency_name 2001 2002 2003 2004 2005 2006 2007
## Department of Agriculture 0 1 14 7 4 8 2
## Department of Commerce 0 0 0 0 0 1 1
## Department of Defense 1 3 3 3 3 5 4
## Department of Energy 0 0 0 0 0 0 4
## Department of Health and Human Services 3 3 3 3 3 5 3
## Department of Homeland Security 0 0 0 0 0 0 2
## Department of Justice 0 0 0 0 0 0 0
## Department of Labor 0 1 1 0 1 1 1
## Department of State 33 24 32 25 41 44 74
## Department of the Air Force 0 0 19 2 4 0 1
## Department of the Army 0 0 1 0 0 0 0
## Department of the Interior 0 0 0 0 0 0 0
## Department of the Navy 0 0 0 1 0 1 0
## Department of the Treasury 0 1 1 1 0 0 1
## Trade and Development Agency 0 3 6 3 5 3 5
## U.S. Agency for International Development 13 25 47 82 88 188 215
## Sum 50 61 127 127 149 256 313
## fiscal_year
## implementing_agency_name 2008 2009 2010 2011 2012 2013 2014
## Department of Agriculture 1 4 10 11 10 2 1
## Department of Commerce 0 0 1 1 2 0 1
## Department of Defense 4 6 5 8 6 5 10
## Department of Energy 4 3 4 5 3 2 2
## Department of Health and Human Services 3 4 4 2 1 3 2
## Department of Homeland Security 0 0 0 1 0 0 1
## Department of Justice 2 2 4 0 3 1 0
## Department of Labor 0 2 0 0 0 0 0
## Department of State 58 64 80 77 67 67 86
## Department of the Air Force 1 0 2 0 0 0 1
## Department of the Army 0 0 4 0 0 1 6
## Department of the Interior 0 0 0 1 2 2 0
## Department of the Navy 0 1 13 0 0 0 0
## Department of the Treasury 1 2 2 4 0 0 0
## Trade and Development Agency 0 1 9 9 16 11 7
## U.S. Agency for International Development 290 441 511 557 492 589 478
## Sum 364 530 649 676 602 683 595
## fiscal_year
## implementing_agency_name 2015 2016 2017 2018 2019 2020 2021
## Department of Agriculture 1 8 2 1 1 1 0
## Department of Commerce 0 0 0 1 0 0 0
## Department of Defense 5 4 3 2 4 1 0
## Department of Energy 1 1 1 1 1 1 0
## Department of Health and Human Services 0 8 29 24 10 2 0
## Department of Homeland Security 1 0 2 0 0 0 0
## Department of Justice 0 0 0 0 0 0 0
## Department of Labor 0 0 0 0 0 0 0
## Department of State 71 70 101 72 48 9 0
## Department of the Air Force 0 1 1 2 4 1 0
## Department of the Army 2 2 0 1 1 1 0
## Department of the Interior 0 0 0 2 0 0 0
## Department of the Navy 2 1 0 0 0 0 0
## Department of the Treasury 0 0 0 0 0 0 0
## Trade and Development Agency 4 3 1 0 1 1 0
## U.S. Agency for International Development 507 435 423 365 342 337 188
## Sum 594 533 563 471 412 354 188
## fiscal_year
## implementing_agency_name Sum
## Department of Agriculture 89
## Department of Commerce 8
## Department of Defense 85
## Department of Energy 33
## Department of Health and Human Services 115
## Department of Homeland Security 7
## Department of Justice 12
## Department of Labor 7
## Department of State 1143
## Department of the Air Force 39
## Department of the Army 19
## Department of the Interior 7
## Department of the Navy 19
## Department of the Treasury 13
## Trade and Development Agency 88
## U.S. Agency for International Development 6613
## Sum 8297
usw %>% select(fiscal_year, implementing_agency_name,assistance_category_name) %>% group_by(fiscal_year,assistance_category_name) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fiscal_year, y = count, fill = assistance_category_name))+
geom_col()+
theme_classic()+
labs(
title = "Details about number of projects in each Fiscal Year",
subtitle = "The count of projects in every fiscal year",
caption = "US aid program 2001-2023",
x = "Fiscal year", y = "Number of projects"
)+scale_colour_continuous()
## `summarise()` has grouped output by 'fiscal_year'. You can override using the
## `.groups` argument.
assitance_year <- xtabs(~ fiscal_year + assistance_category_name , data = usw)
addmargins(assitance_year)
## assistance_category_name
## fiscal_year Economic Military Sum
## 2001 49 1 50
## 2002 58 3 61
## 2003 104 23 127
## 2004 121 6 127
## 2005 142 7 149
## 2006 251 5 256
## 2007 309 4 313
## 2008 360 4 364
## 2009 525 5 530
## 2010 628 21 649
## 2011 670 6 676
## 2012 597 5 602
## 2013 678 5 683
## 2014 579 16 595
## 2015 585 9 594
## 2016 525 8 533
## 2017 559 4 563
## 2018 466 5 471
## 2019 403 9 412
## 2020 352 2 354
## 2021 188 0 188
## Sum 8149 148 8297
usw %>% select(assistance_category_name, fiscal_year) %>% group_by(assistance_category_name) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(
x = fct_reorder(assistance_category_name,count), count, label = count, fill = assistance_category_name))+
geom_col() +
geom_text(hjust = -0.2)+
coord_flip()+
theme(legend.position = "none") +
labs(
title = "The Number of Assistance categoies",
subtitle = "Details about Assistance categoies",
caption = "US aid program 2001-2023",
y = "Number of projects per year", x = "Assistance cateroies")+scale_fill_discrete()
economic <- usw %>% select(fiscal_year, assistance_category_name) %>% group_by(fiscal_year) %>% filter (assistance_category_name == "Economic")%>%
summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fiscal_year, y = count))+
geom_bar(stat = "identity",fill = "blue")+
theme_classic()+
labs(
title = "Number of Economic projects in each Fiscal Year",
subtitle = "Economic projects",
#caption = "US aid program 2001-2023",
x = "Fiscal year", y = "Economic projects"
)+
scale_fill_brewer()
military <- usw %>% select(fiscal_year, assistance_category_name) %>% group_by(fiscal_year) %>% filter (assistance_category_name == "Military") %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fiscal_year, y = count))+
geom_bar(stat = "identity" , fill = "red")+
theme_classic()+
labs(
title = "Number of Military projects in each Fiscal Year",
subtitle = "Military projects",
caption = "US aid program 2001-2023",
x = "Fiscal year", y = "Military projects"
)+
scale_fill_brewer()
plot_grid(economic, military, ncol = 1)
aid_type_year <- xtabs(~ fiscal_year + aid_type_group_name , data = usw)
addmargins(aid_type_year)
## aid_type_group_name
## fiscal_year Administrative Costs Budget support Core contributions Debt relief
## 2001 1 0 3 0
## 2002 5 1 4 0
## 2003 16 0 7 1
## 2004 17 0 5 1
## 2005 20 0 9 0
## 2006 20 0 9 0
## 2007 28 7 17 0
## 2008 48 0 19 0
## 2009 79 0 12 0
## 2010 95 3 18 0
## 2011 118 5 19 0
## 2012 102 12 13 0
## 2013 202 9 4 0
## 2014 194 6 7 0
## 2015 173 9 10 0
## 2016 153 2 6 0
## 2017 165 0 2 0
## 2018 134 0 0 0
## 2019 121 0 1 0
## 2020 125 1 2 0
## 2021 76 0 0 0
## Sum 1892 55 167 2
## aid_type_group_name
## fiscal_year Project-Type Technical Assistance Sum
## 2001 45 1 50
## 2002 43 8 61
## 2003 90 13 127
## 2004 92 12 127
## 2005 104 16 149
## 2006 202 25 256
## 2007 228 33 313
## 2008 254 43 364
## 2009 376 63 530
## 2010 442 91 649
## 2011 461 73 676
## 2012 412 63 602
## 2013 419 49 683
## 2014 359 29 595
## 2015 375 27 594
## 2016 345 27 533
## 2017 368 28 563
## 2018 317 20 471
## 2019 279 11 412
## 2020 210 16 354
## 2021 110 2 188
## Sum 5531 650 8297
usw %>% select(fiscal_year, aid_type_group_name,assistance_category_name) %>% group_by(aid_type_group_name,assistance_category_name) %>% summarise(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(aid_type_group_name, count), y = count, label = count))+
geom_bar(stat = "identity", fill = "skyblue")+
scale_fill_brewer()+
theme_classic()+
labs(
title = "Number of Military projects in each Fiscal Year",
subtitle = "Military projects",
caption = "US aid program 2001-2023",
x = "Fiscal year", y = "Military projects"
)+ coord_flip()+ geom_text(hjust = 0.2)+ facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'aid_type_group_name'. You can override
## using the `.groups` argument.
source_year <- xtabs(~ channel_source + fiscal_year , data = usw)
addmargins(source_year)
## fiscal_year
## channel_source 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 0 0 0 0 0 0 0 0 0
## International 23 12 14 14 11 17 32 21 22
## International Monetary Fund 0 0 0 0 0 0 0 0 0
## Non United States 1 4 13 17 28 58 70 84 113
## Other 0 1 0 0 2 4 2 3 5
## United Nations 3 9 19 7 9 35 17 24 45
## United States 23 35 81 89 99 142 192 232 344
## World Bank Group 0 0 0 0 0 0 0 0 1
## Sum 50 61 127 127 149 256 313 364 530
## fiscal_year
## channel_source 2010 2011 2012 2013 2014 2015 2016 2017 2018
## 0 0 0 0 0 1 0 0 0
## International 17 24 16 14 13 6 2 2 2
## International Monetary Fund 0 0 0 0 0 0 0 2 1
## Non United States 164 194 213 263 236 247 213 211 173
## Other 6 12 12 12 8 13 8 3 5
## United Nations 66 67 48 23 15 25 26 24 25
## United States 396 373 311 369 321 299 282 319 263
## World Bank Group 0 6 2 2 2 3 2 2 2
## Sum 649 676 602 683 595 594 533 563 471
## fiscal_year
## channel_source 2019 2020 2021 Sum
## 0 0 0 1
## International 5 7 3 277
## International Monetary Fund 0 0 0 3
## Non United States 163 130 81 2676
## Other 6 6 2 110
## United Nations 19 14 7 527
## United States 218 195 95 4678
## World Bank Group 1 2 0 25
## Sum 412 354 188 8297
usw %>% select(channel_source, fiscal_year,assistance_category_name) %>% group_by(channel_source,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(channel_source, count), count,label = count))+ geom_bar(stat= "identity", fill = "purple")+
coord_flip()+theme(legend.position = "none")+geom_text(hjust = 0.25)+
labs(
title = "Number of projects per channel source",
subtitle = "Channel sources of Projects",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Channel sources"
)+facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'channel_source'. You can override using
## the `.groups` argument.
dac_cat_year <- xtabs(~ dac_category_name + fiscal_year , data = usw)
usw %>% select(dac_category_name, fiscal_year,assistance_category_name) %>% group_by(dac_category_name,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(dac_category_name, count), count,label = count))+ geom_bar(stat= "identity", fill = "orange")+
coord_flip()+theme(legend.position = "none")+geom_text(hjust = 0.6)+
labs(
title = "Number of projects per Categories",
subtitle = "Category sources of Projects",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Category Names"
)+facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'dac_category_name'. You can override using
## the `.groups` argument.
activ_year <- xtabs(~ funding_agency_name + fiscal_year , data = usw)
addmargins(activ_year)
## fiscal_year
## funding_agency_name 2001 2002 2003 2004 2005 2006 2007
## Department of Agriculture 4 11 22 18 4 24 2
## Department of Commerce 0 0 0 0 0 0 0
## Department of Defense 1 3 23 6 7 6 5
## Department of Energy 0 0 0 0 0 0 4
## Department of Health and Human Services 3 3 3 3 3 5 3
## Department of Justice 0 0 0 0 0 0 0
## Department of Labor 0 1 1 0 1 1 1
## Department of State 33 23 32 23 33 42 71
## Department of the Air Force 0 0 0 0 0 0 0
## Department of the Army 0 0 0 0 0 0 0
## Department of the Interior 0 0 0 0 0 0 0
## Department of the Navy 0 0 0 0 0 0 0
## Department of the Treasury 0 1 0 0 0 0 1
## Trade and Development Agency 0 3 6 3 5 3 5
## U.S. Agency for International Development 9 16 40 74 96 175 221
## Sum 50 61 127 127 149 256 313
## fiscal_year
## funding_agency_name 2008 2009 2010 2011 2012 2013 2014
## Department of Agriculture 11 23 37 42 39 13 11
## Department of Commerce 0 0 0 0 1 0 0
## Department of Defense 5 6 21 7 5 5 13
## Department of Energy 4 2 4 3 3 2 2
## Department of Health and Human Services 3 4 4 2 1 3 2
## Department of Justice 1 0 0 0 3 1 0
## Department of Labor 0 2 0 0 0 0 0
## Department of State 58 76 102 95 85 97 120
## Department of the Air Force 0 0 0 0 0 0 0
## Department of the Army 0 1 3 1 1 1 4
## Department of the Interior 0 0 0 1 2 2 0
## Department of the Navy 0 0 0 0 0 0 0
## Department of the Treasury 1 2 2 4 0 0 0
## Trade and Development Agency 0 1 9 9 4 1 1
## U.S. Agency for International Development 281 413 467 512 458 558 442
## Sum 364 530 649 676 602 683 595
## fiscal_year
## funding_agency_name 2015 2016 2017 2018 2019 2020 2021
## Department of Agriculture 10 22 18 16 12 7 3
## Department of Commerce 0 0 0 1 0 0 0
## Department of Defense 9 5 3 2 4 1 0
## Department of Energy 1 1 1 1 1 1 0
## Department of Health and Human Services 0 8 29 24 10 2 0
## Department of Justice 0 0 0 0 0 0 0
## Department of Labor 0 0 0 0 0 0 0
## Department of State 98 94 65 65 56 33 15
## Department of the Air Force 0 1 1 2 4 1 0
## Department of the Army 0 1 0 1 1 1 0
## Department of the Interior 0 0 0 1 0 0 0
## Department of the Navy 0 1 0 0 0 0 0
## Department of the Treasury 0 0 0 0 0 0 0
## Trade and Development Agency 0 0 0 0 0 1 0
## U.S. Agency for International Development 476 400 446 358 324 307 170
## Sum 594 533 563 471 412 354 188
## fiscal_year
## funding_agency_name Sum
## Department of Agriculture 349
## Department of Commerce 2
## Department of Defense 137
## Department of Energy 30
## Department of Health and Human Services 115
## Department of Justice 5
## Department of Labor 7
## Department of State 1316
## Department of the Air Force 9
## Department of the Army 15
## Department of the Interior 6
## Department of the Navy 1
## Department of the Treasury 11
## Trade and Development Agency 51
## U.S. Agency for International Development 6243
## Sum 8297
usw %>% select(funding_agency_name, fiscal_year,assistance_category_name) %>% group_by(funding_agency_name,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(funding_agency_name, count), count,label = count))+ geom_bar(stat= "identity", fill = "pink")+
coord_flip()+theme(legend.position = "none")+geom_text(hjust = 0.4)+
labs(
title = "Funding agencies for Projects",
subtitle = "Details of funding agencies",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Funding agency"
)+facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'funding_agency_name'. You can override
## using the `.groups` argument.
aid_gp_year <- xtabs(~ aid_type_group_name + fiscal_year , data = usw)
addmargins(aid_gp_year)
## fiscal_year
## aid_type_group_name 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## Administrative Costs 1 5 16 17 20 20 28 48 79 95 118
## Budget support 0 1 0 0 0 0 7 0 0 3 5
## Core contributions 3 4 7 5 9 9 17 19 12 18 19
## Debt relief 0 0 1 1 0 0 0 0 0 0 0
## Project-Type 45 43 90 92 104 202 228 254 376 442 461
## Technical Assistance 1 8 13 12 16 25 33 43 63 91 73
## Sum 50 61 127 127 149 256 313 364 530 649 676
## fiscal_year
## aid_type_group_name 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 Sum
## Administrative Costs 102 202 194 173 153 165 134 121 125 76 1892
## Budget support 12 9 6 9 2 0 0 0 1 0 55
## Core contributions 13 4 7 10 6 2 0 1 2 0 167
## Debt relief 0 0 0 0 0 0 0 0 0 0 2
## Project-Type 412 419 359 375 345 368 317 279 210 110 5531
## Technical Assistance 63 49 29 27 27 28 20 11 16 2 650
## Sum 602 683 595 594 533 563 471 412 354 188 8297
usw %>% select(aid_type_group_name, fiscal_year,assistance_category_name) %>% group_by(aid_type_group_name,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(aid_type_group_name, count), count,label = count))+
geom_bar(stat= "identity", fill = "magenta")+
coord_flip()+theme(legend.position = "none")+
geom_text(hjust = 0.7)+
labs(title = "Funding agencies for Projects",
subtitle = "Details of funding agencies",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Funding agency")+
facet_grid(~assistance_category_name)
## `summarise()` has grouped output by 'aid_type_group_name'. You can override
## using the `.groups` argument.
chan_src_year <- xtabs(~ channel_source + fiscal_year , data = usw)
addmargins(chan_src_year)
## fiscal_year
## channel_source 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 0 0 0 0 0 0 0 0 0
## International 23 12 14 14 11 17 32 21 22
## International Monetary Fund 0 0 0 0 0 0 0 0 0
## Non United States 1 4 13 17 28 58 70 84 113
## Other 0 1 0 0 2 4 2 3 5
## United Nations 3 9 19 7 9 35 17 24 45
## United States 23 35 81 89 99 142 192 232 344
## World Bank Group 0 0 0 0 0 0 0 0 1
## Sum 50 61 127 127 149 256 313 364 530
## fiscal_year
## channel_source 2010 2011 2012 2013 2014 2015 2016 2017 2018
## 0 0 0 0 0 1 0 0 0
## International 17 24 16 14 13 6 2 2 2
## International Monetary Fund 0 0 0 0 0 0 0 2 1
## Non United States 164 194 213 263 236 247 213 211 173
## Other 6 12 12 12 8 13 8 3 5
## United Nations 66 67 48 23 15 25 26 24 25
## United States 396 373 311 369 321 299 282 319 263
## World Bank Group 0 6 2 2 2 3 2 2 2
## Sum 649 676 602 683 595 594 533 563 471
## fiscal_year
## channel_source 2019 2020 2021 Sum
## 0 0 0 1
## International 5 7 3 277
## International Monetary Fund 0 0 0 3
## Non United States 163 130 81 2676
## Other 6 6 2 110
## United Nations 19 14 7 527
## United States 218 195 95 4678
## World Bank Group 1 2 0 25
## Sum 412 354 188 8297
usw %>% select(channel_source, fiscal_year,assistance_category_name) %>% group_by(channel_source,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ggplot(aes(x = fct_reorder(channel_source, count),y=count, label = count))+
geom_bar(stat = "identity", fill = "blue")+
coord_flip()+theme(legend.position = "none")+
geom_text(vjust = -0.4)+
labs(title = "Funding agencies for Projects",
subtitle = "Details of funding agencies",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Funding agency")+
facet_wrap(~assistance_category_name)
## `summarise()` has grouped output by 'channel_source'. You can override using
## the `.groups` argument.
library(forcats)
chan_nat_year <- xtabs(~ channel_nature + fiscal_year , data = usw)
addmargins(chan_nat_year)
## fiscal_year
## channel_nature 2001 2002 2003 2004 2005 2006 2007 2008
## Church and Faith Based 1 0 4 3 3 4 9 7
## Enterprises 2 7 21 47 51 90 114 154
## Government 4 15 41 19 24 41 62 80
## Multilateral 3 10 19 7 11 39 19 27
## NGO 40 29 40 48 56 75 102 87
## Public and Private Partnerships 0 0 0 0 0 0 0 0
## Universities and Research Institutes 0 0 2 3 4 7 7 9
## Sum 50 61 127 127 149 256 313 364
## fiscal_year
## channel_nature 2009 2010 2011 2012 2013 2014 2015 2016
## Church and Faith Based 9 10 10 8 8 5 12 15
## Enterprises 225 269 264 211 309 244 256 202
## Government 109 153 138 130 146 164 145 149
## Multilateral 51 72 85 62 37 25 41 36
## NGO 128 135 169 175 173 137 114 113
## Public and Private Partnerships 0 0 0 0 0 0 1 0
## Universities and Research Institutes 8 10 10 16 10 20 25 18
## Sum 530 649 676 602 683 595 594 533
## fiscal_year
## channel_nature 2017 2018 2019 2020 2021 Sum
## Church and Faith Based 9 5 5 1 0 128
## Enterprises 214 178 174 156 98 3286
## Government 181 138 112 110 51 2012
## Multilateral 31 33 26 22 9 665
## NGO 109 98 80 54 28 1990
## Public and Private Partnerships 0 0 0 0 0 1
## Universities and Research Institutes 19 19 15 11 2 215
## Sum 563 471 412 354 188 8297
usw %>% select(channel_nature, fiscal_year,assistance_category_name) %>% group_by(channel_nature,assistance_category_name) %>% summarize(count = n()) %>% arrange(desc(count)) %>% mutate(channel_nature = as.factor(channel_nature)) %>%
ggplot(aes(x = fct_reorder(channel_nature, count),y=count, label = count))+
geom_bar(stat = "identity", fill = "blue")+
coord_flip()+theme(legend.position = "none")+
geom_text(vjust = -0.4)+
labs(title = "Funding agencies for Projects",
subtitle = "Details of funding agencies",
caption = "US aid program 2001-2023",
y = "Number of projects", x = "Funding agency")+
facet_wrap(~assistance_category_name)
## `summarise()` has grouped output by 'channel_nature'. You can override using
## the `.groups` argument.
head(usw)
## # A tibble: 6 × 19
## `_id` implementing_agency_name subagency_name channel_category_name
## <dbl> <chr> <chr> <chr>
## 1 1 U.S. Agency for International Deve… Bureau for As… Government
## 2 2 Department of Defense Office of the… Government
## 3 3 Department of Defense Office of the… Government
## 4 4 Department of Defense Defense Secur… Government
## 5 5 Department of Defense Defense Secur… Government
## 6 6 Department of Defense Defense Secur… Government
## # ℹ 15 more variables: channel_subcategory_name <chr>, channel_name <chr>,
## # dac_category_name <chr>, dac_sector_name <chr>, dac_purpose_name <chr>,
## # funding_account_name <chr>, assistance_category_name <chr>,
## # activity_name <chr>, fiscal_year <dbl>, current_amount <dbl>,
## # funding_agency_name <chr>, aid_type_group_name <chr>,
## # activity_description <chr>, channel_nature <chr>, channel_source <chr>
colnames(usw)
## [1] "_id" "implementing_agency_name"
## [3] "subagency_name" "channel_category_name"
## [5] "channel_subcategory_name" "channel_name"
## [7] "dac_category_name" "dac_sector_name"
## [9] "dac_purpose_name" "funding_account_name"
## [11] "assistance_category_name" "activity_name"
## [13] "fiscal_year" "current_amount"
## [15] "funding_agency_name" "aid_type_group_name"
## [17] "activity_description" "channel_nature"
## [19] "channel_source"
usw <- usw %>%
mutate(NumChar = nchar(activity_description),
NumWords = str_count(activity_description, patter = "\\w+" ))
range(usw$fiscal_year)
## [1] 2001 2021
datatable(usw %>% select(fiscal_year, activity_description,assistance_category_name) %>%
filter(fiscal_year == 2001 & assistance_category_name == "Economic"), caption = NULL, options = list(dom="t"))
datatable(usw %>% select(fiscal_year, activity_description,assistance_category_name) %>%
filter(fiscal_year == 2001 & assistance_category_name == "Military"), caption = NULL, options = list(dom="t"))
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:lightgbm':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
usw %>% select(NumChar, NumWords, fiscal_year) %>%
ggplot(aes(x = NumChar, y = NumWords,col = as.factor(fiscal_year))) +
geom_point() +
labs(title = "Number of character vs Number of words by year",
x = "Number of Characters",
col = "fiscal_year",
y = "Number of words")
usw %>% select(NumChar) %>%
ggplot(aes(x = NumChar)) +
geom_histogram(fill = "darkred", bins = 30) +
labs(title = "Number of characters used",
x = "Number of characters")
#Number of characters by Year(Boxplot)
usw %>% select(NumChar, fiscal_year) %>%
ggplot(aes(x = as.factor(fiscal_year), y = NumChar))+
geom_boxplot(aes(fill = as.factor(fiscal_year)))+
labs(title = "Number of character by year",
x = "Year",
fill = "year",
y = "Number of Characters")+
theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7), legend.position = "none")
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
usw %>% select(activity_description) %>% unnest_tokens(word, activity_description) %>% anti_join(get_stopwords()) %>%
count(word, sort = TRUE) %>%
slice_max(n, n = 20) %>%
ggplot(aes(
x = fct_reorder(word, n), y =n, fill = word))+
geom_col()+ coord_flip()+
theme(legend.position = "none") +
labs(
title = "The most word used",
subtitle = "Bar plot, The most word used",
caption = "Kaggle: All Trump's Twitter insults (2015-2021)",
x = "word",
y = "Count"
)
## Joining with `by = join_by(word)`
#bigram
usw %>% select(activity_description, fiscal_year) %>%
unnest_tokens(Bigram, activity_description, token = "ngrams", n = 2) %>%
separate(Bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,!word2 %in% stop_words$word) %>%
unite(Bigram, word1, word2, sep = " ") %>%
filter(Bigram != "https t.co") %>%
count(Bigram, fiscal_year, sort = TRUE) %>%
mutate(Bigram = reorder_within(Bigram, n, fiscal_year)) %>%
slice_max(n, n = 30) %>%
ggplot(aes(
x = fct_reorder(Bigram, n),
y = n,
fill = Bigram
)) +
geom_col() +
coord_flip() +
facet_wrap(vars(fiscal_year), scales = "free_y", ncol = 2) +
scale_x_reordered() +
scale_x_reordered() +
theme(legend.position = "none") +
labs(
title = "The most words used (Bigram)",
subtitle = "Bar plot and facet wrap, The most words used (Bigram)",
caption = "US aid program 2001-2022",
x = "Bigram",
y = "Count"
)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.