library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
NYS_Tax_Wide = read_csv("https://raw.githubusercontent.com/professorfoy/DATA-607/main/NYS_Tax.csv",show_col_types = FALSE)
# Start tidying the data
glimpse(NYS_Tax_Wide)
## Rows: 1,864
## Columns: 22
## $ `Tax Year` <dbl> 2011, 2011, 20~
## $ `Resident Type` <chr> "Full-Year Res~
## $ `Place of Residence` <chr> "New York City~
## $ Country <chr> "United States~
## $ State <chr> "New York", "N~
## $ County <chr> "Bronx", "King~
## $ Disclosure <lgl> NA, NA, NA, NA~
## $ `Number of All Returns` <dbl> 588674, 107852~
## $ `NY AGI of All Returns (in thousands) *` <dbl> 17626369, 4814~
## $ `Tax Liability of All Returns (in thousands) **` <dbl> 433593, 198945~
## $ `Number of Taxable Returns` <dbl> 299820, 636200~
## $ `NY AGI of Taxable Returns (in thousands) *` <dbl> 14719525, 4430~
## $ `Tax Liability of Taxable Returns (in thousands) **` <dbl> 598802, 223998~
## $ `Number of Nontaxable Returns` <dbl> 288854, 442320~
## $ `NY AGI of Nontaxable Returns (in thousands) *` <dbl> 2906845, 38414~
## $ `Tax Liability of Nontaxable Returns (in thousands) **` <dbl> -165209, -2505~
## $ `Average NY AGI of All Returns` <dbl> 29942, 44640, ~
## $ `Average Tax of All Returns` <dbl> 737, 1845, 117~
## $ `Average NY AGI of Taxable Returns` <dbl> 49095, 69639, ~
## $ `Average Tax of Taxable Returns` <dbl> 1997, 3521, 16~
## $ `Average NY AGI of Nontaxable Returns` <dbl> 10063, 8685, -~
## $ `County Sort Order` <dbl> 1, 2, 3, 4, 5,~
view(NYS_Tax_Wide)
# remove unneeded rows
NYS_Tax_Wide1 <- select(NYS_Tax_Wide, c('Tax Year', 'Resident Type', 'State', 'County', 'Average NY AGI of All Returns', 'Average NY AGI of Nontaxable Returns'))
NYS_Tax_Wide1 <- rename(NYS_Tax_Wide1, Year = 'Tax Year', Residency_Type = 'Resident Type')
NYS_Tax_Wide1
## # A tibble: 1,864 x 6
## Year Residency_Type State County `Average NY AGI ~ `Average NY AGI ~
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2011 Full-Year Resident New York Bronx 29942 10063
## 2 2011 Full-Year Resident New York Kings 44640 8685
## 3 2011 Full-Year Resident New York Manhat~ 159697 -13802
## 4 2011 Full-Year Resident New York Queens 40108 9290
## 5 2011 Full-Year Resident New York Richmo~ 55928 6921
## 6 2011 Full-Year Resident New York Total,~ 66766 5384
## 7 2011 Full-Year Resident New York Albany 54574 7420
## 8 2011 Full-Year Resident New York Allega~ 36479 9845
## 9 2011 Full-Year Resident New York Broome 41132 7900
## 10 2011 Full-Year Resident New York Cattar~ 35669 9225
## # ... with 1,854 more rows
str(NYS_Tax_Wide1)
## tibble [1,864 x 6] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:1864] 2011 2011 2011 2011 2011 ...
## $ Residency_Type : chr [1:1864] "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" ...
## $ State : chr [1:1864] "New York" "New York" "New York" "New York" ...
## $ County : chr [1:1864] "Bronx" "Kings" "Manhattan" "Queens" ...
## $ Average NY AGI of All Returns : num [1:1864] 29942 44640 159697 40108 55928 ...
## $ Average NY AGI of Nontaxable Returns: num [1:1864] 10063 8685 -13802 9290 6921 ...
view(NYS_Tax_Wide1)
# remove unneeded rows
NYS_Tax_Wide2 <- NYS_Tax_Wide1 %>%
filter(State=="New York") %>%
filter(Year == 2014)
NYS_Tax_Wide2
## # A tibble: 67 x 6
## Year Residency_Type State County `Average NY AGI~ `Average NY AGI~
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2014 Full-Year Resident New York Monroe 53674 8739
## 2 2014 Full-Year Resident New York Washington 40444 9402
## 3 2014 Full-Year Resident New York Chautauqua 39557 10038
## 4 2014 Full-Year Resident New York Tompkins 58247 8644
## 5 2014 Full-Year Resident New York Lewis 40295 9853
## 6 2014 Full-Year Resident New York Seneca 40401 10591
## 7 2014 Full-Year Resident New York Yates 39568 11526
## 8 2014 Full-Year Resident New York Hamilton 36986 5573
## 9 2014 Full-Year Resident New York Ulster 51479 6839
## 10 2014 Full-Year Resident New York Sullivan 43570 8542
## # ... with 57 more rows
str(NYS_Tax_Wide2)
## tibble [67 x 6] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:67] 2014 2014 2014 2014 2014 ...
## $ Residency_Type : chr [1:67] "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" ...
## $ State : chr [1:67] "New York" "New York" "New York" "New York" ...
## $ County : chr [1:67] "Monroe" "Washington" "Chautauqua" "Tompkins" ...
## $ Average NY AGI of All Returns : num [1:67] 53674 40444 39557 58247 40295 ...
## $ Average NY AGI of Nontaxable Returns: num [1:67] 8739 9402 10038 8644 9853 ...
view(NYS_Tax_Wide2)
NYS_Tax_Long <- NYS_Tax_Wide2 %>% gather('NY AGI', 'Dollar Amount', -c('Year', 'Residency_Type', 'State', 'County'))
NYS_Tax_Long$`Dollar Amount` <- as.numeric(NYS_Tax_Long$`Dollar Amount`)
NYS_Tax_Long <- rename(NYS_Tax_Long, Amount = 'Dollar Amount')
NYS_Tax_Long
## # A tibble: 134 x 6
## Year Residency_Type State County `NY AGI` Amount
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2014 Full-Year Resident New York Monroe Average NY AGI of All Re~ 53674
## 2 2014 Full-Year Resident New York Washington Average NY AGI of All Re~ 40444
## 3 2014 Full-Year Resident New York Chautauqua Average NY AGI of All Re~ 39557
## 4 2014 Full-Year Resident New York Tompkins Average NY AGI of All Re~ 58247
## 5 2014 Full-Year Resident New York Lewis Average NY AGI of All Re~ 40295
## 6 2014 Full-Year Resident New York Seneca Average NY AGI of All Re~ 40401
## 7 2014 Full-Year Resident New York Yates Average NY AGI of All Re~ 39568
## 8 2014 Full-Year Resident New York Hamilton Average NY AGI of All Re~ 36986
## 9 2014 Full-Year Resident New York Ulster Average NY AGI of All Re~ 51479
## 10 2014 Full-Year Resident New York Sullivan Average NY AGI of All Re~ 43570
## # ... with 124 more rows
str(NYS_Tax_Long)
## tibble [134 x 6] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:134] 2014 2014 2014 2014 2014 ...
## $ Residency_Type: chr [1:134] "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" ...
## $ State : chr [1:134] "New York" "New York" "New York" "New York" ...
## $ County : chr [1:134] "Monroe" "Washington" "Chautauqua" "Tompkins" ...
## $ NY AGI : chr [1:134] "Average NY AGI of All Returns" "Average NY AGI of All Returns" "Average NY AGI of All Returns" "Average NY AGI of All Returns" ...
## $ Amount : num [1:134] 53674 40444 39557 58247 40295 ...
view(NYS_Tax_Long)
NYS_Tax_Long1 <- NYS_Tax_Long %>% group_by(Amount) %>%
arrange(desc(Amount),.by_group = TRUE)
NYS_Tax_Long1
## # A tibble: 134 x 6
## # Groups: Amount [133]
## Year Residency_Type State County `NY AGI` Amount
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2014 Full-Year Resident New York Manhattan Average NY AGI~ -8964
## 2 2014 Full-Year Resident New York Nassau Average NY AGI~ -220
## 3 2014 Full-Year Resident New York Warren Average NY AGI~ 2551
## 4 2014 Full-Year Resident New York Westchester Average NY AGI~ 2858
## 5 2014 Full-Year Resident New York Essex Average NY AGI~ 5289
## 6 2014 Full-Year Resident New York Hamilton Average NY AGI~ 5573
## 7 2014 Full-Year Resident New York Residence Unknown ++ Average NY AGI~ 5584
## 8 2014 Full-Year Resident New York Saratoga Average NY AGI~ 5680
## 9 2014 Full-Year Resident New York Rockland Average NY AGI~ 5890
## 10 2014 Full-Year Resident New York Total, New York City Average NY AGI~ 6379
## # ... with 124 more rows
str(NYS_Tax_Long1)
## grouped_df [134 x 6] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Year : num [1:134] 2014 2014 2014 2014 2014 ...
## $ Residency_Type: chr [1:134] "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" "Full-Year Resident" ...
## $ State : chr [1:134] "New York" "New York" "New York" "New York" ...
## $ County : chr [1:134] "Manhattan" "Nassau" "Warren" "Westchester" ...
## $ NY AGI : chr [1:134] "Average NY AGI of Nontaxable Returns" "Average NY AGI of Nontaxable Returns" "Average NY AGI of Nontaxable Returns" "Average NY AGI of Nontaxable Returns" ...
## $ Amount : num [1:134] -8964 -220 2551 2858 5289 ...
## - attr(*, "groups")= tibble [133 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ Amount: num [1:133] -8964 -220 2551 2858 5289 ...
## ..$ .rows : list<int> [1:133]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int [1:2] 51 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..$ : int 59
## .. ..$ : int 60
## .. ..$ : int 61
## .. ..$ : int 62
## .. ..$ : int 63
## .. ..$ : int 64
## .. ..$ : int 65
## .. ..$ : int 66
## .. ..$ : int 67
## .. ..$ : int 68
## .. ..$ : int 69
## .. ..$ : int 70
## .. ..$ : int 71
## .. ..$ : int 72
## .. ..$ : int 73
## .. ..$ : int 74
## .. ..$ : int 75
## .. ..$ : int 76
## .. ..$ : int 77
## .. ..$ : int 78
## .. ..$ : int 79
## .. ..$ : int 80
## .. ..$ : int 81
## .. ..$ : int 82
## .. ..$ : int 83
## .. ..$ : int 84
## .. ..$ : int 85
## .. ..$ : int 86
## .. ..$ : int 87
## .. ..$ : int 88
## .. ..$ : int 89
## .. ..$ : int 90
## .. ..$ : int 91
## .. ..$ : int 92
## .. ..$ : int 93
## .. ..$ : int 94
## .. ..$ : int 95
## .. ..$ : int 96
## .. ..$ : int 97
## .. ..$ : int 98
## .. ..$ : int 99
## .. ..$ : int 100
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
view(NYS_Tax_Long1)
Projections = read_csv("https://raw.githubusercontent.com/professorfoy/DATA-607/main/Short-term_Industry_Projections.csv",show_col_types = FALSE)
glimpse(Projections)
## Rows: 169
## Columns: 8
## $ Area <chr> "New York State", "New York State~
## $ Period <chr> "2020-2022", "2020-2022", "2020-2~
## $ `Industry Code` <chr> "'000000", "'000067", "'000670", ~
## $ `Industry Title` <chr> "Total All Industries", "Self Emp~
## $ `Base Year Employment Estimate` <dbl> 8678780, 513590, 513590, 513590, ~
## $ `Projected Year Employment Estimate` <dbl> 10156560, 513590, 513590, 513590,~
## $ `Net Change` <dbl> 1477780, 0, 0, 0, 0, 160110, 720,~
## $ `Annual Growth Rate` <dbl> 17.0, 0.0, 0.0, 0.0, 0.0, 22.5, 1~
view(Projections)
# remove unneeded rows and rename variables.
Projections1 <- select(Projections, -c(1, 2, 3))
Projections1 <- rename(Projections1, Industry_Title = 'Industry Title',
Base_Year_Employment_Estimate = 'Base Year Employment Estimate',
Projected_Year_Employment_Estimate = 'Projected Year Employment Estimate',
Net_Change = 'Net Change',
Annual_Growth_Rate = 'Annual Growth Rate')
Projections1
## # A tibble: 169 x 5
## Industry_Title Base_Year_Emplo~ Projected_Year_~ Net_Change Annual_Growth_R~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Total All Indu~ 8678780 10156560 1477780 17
## 2 Self Employed ~ 513590 513590 0 0
## 3 Self Employed ~ 513590 513590 0 0
## 4 Total Self Emp~ 513590 513590 0 0
## 5 Self Employed ~ 513590 513590 0 0
## 6 Goods Producing 710210 870320 160110 22.5
## 7 Natural Resour~ 36940 37660 720 1.9
## 8 Construction 299430 388950 89520 29.9
## 9 Manufacturing 373830 443710 69880 18.7
## 10 Services Provi~ 7454990 8772660 1317670 17.7
## # ... with 159 more rows
str(Projections1)
## tibble [169 x 5] (S3: tbl_df/tbl/data.frame)
## $ Industry_Title : chr [1:169] "Total All Industries" "Self Employed and Unpaid Family Workers, All Jobs" "Self Employed and Unpaid Family Workers, All Jobs" "Total Self Employed and Unpaid Family Workers, All Jobs" ...
## $ Base_Year_Employment_Estimate : num [1:169] 8678780 513590 513590 513590 513590 ...
## $ Projected_Year_Employment_Estimate: num [1:169] 10156560 513590 513590 513590 513590 ...
## $ Net_Change : num [1:169] 1477780 0 0 0 0 ...
## $ Annual_Growth_Rate : num [1:169] 17 0 0 0 0 22.5 1.9 29.9 18.7 17.7 ...
view(Projections1)
Projections2 <- distinct(Projections1)
Projections2 %>% slice(-c(1))
## # A tibble: 153 x 5
## Industry_Title Base_Year_Emplo~ Projected_Year_~ Net_Change Annual_Growth_R~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Self Employed ~ 513590 513590 0 0
## 2 Total Self Emp~ 513590 513590 0 0
## 3 Self Employed ~ 513590 513590 0 0
## 4 Goods Producing 710210 870320 160110 22.5
## 5 Natural Resour~ 36940 37660 720 1.9
## 6 Construction 299430 388950 89520 29.9
## 7 Manufacturing 373830 443710 69880 18.7
## 8 Services Provi~ 7454990 8772660 1317670 17.7
## 9 Trade, Transpo~ 1242780 1551410 308630 24.8
## 10 Information 260830 281890 21060 8.1
## # ... with 143 more rows
str(Projections2)
## tibble [154 x 5] (S3: tbl_df/tbl/data.frame)
## $ Industry_Title : chr [1:154] "Total All Industries" "Self Employed and Unpaid Family Workers, All Jobs" "Total Self Employed and Unpaid Family Workers, All Jobs" "Self Employed Workers, All Jobs" ...
## $ Base_Year_Employment_Estimate : num [1:154] 8678780 513590 513590 513590 710210 ...
## $ Projected_Year_Employment_Estimate: num [1:154] 10156560 513590 513590 513590 870320 ...
## $ Net_Change : num [1:154] 1477780 0 0 0 160110 ...
## $ Annual_Growth_Rate : num [1:154] 17 0 0 0 22.5 1.9 29.9 18.7 17.7 24.8 ...
view(Projections2)
Projections3 <- Projections2 %>% slice(-c(1, 3, 4, 151))
str(Projections3)
## tibble [150 x 5] (S3: tbl_df/tbl/data.frame)
## $ Industry_Title : chr [1:150] "Self Employed and Unpaid Family Workers, All Jobs" "Goods Producing" "Natural Resources and Mining" "Construction" ...
## $ Base_Year_Employment_Estimate : num [1:150] 513590 710210 36940 299430 373830 ...
## $ Projected_Year_Employment_Estimate: num [1:150] 513590 870320 37660 388950 443710 ...
## $ Net_Change : num [1:150] 0 160110 720 89520 69880 ...
## $ Annual_Growth_Rate : num [1:150] 0 22.5 1.9 29.9 18.7 17.7 24.8 8.1 3.8 15 ...
view(Projections3)
Projections4 <- Projections3 %>%
arrange(desc(Annual_Growth_Rate)) %>%
top_n(15)
## Selecting by Annual_Growth_Rate
view(Projections4)
Projections5 <- Projections4 %>%
mutate(Industry_Title = fct_reorder(Industry_Title, Annual_Growth_Rate)) %>%
ggplot(aes(x=Industry_Title, y=Annual_Growth_Rate)) +
geom_bar(stat="identity", fill="dodgerblue4", alpha=.6, width=.4) +
coord_flip()
Projections5
#### To contrast the lowest growth industries, I just took the bottom 15 industries. I did this by creating a new object Projections_Wide6, and just called the bottom 15 rows.
Projections6 <- Projections3 %>%
arrange(Annual_Growth_Rate) %>%
top_n(-15)
## Selecting by Annual_Growth_Rate
view(Projections6)
Projections7 <- Projections6 %>%
mutate(Industry_Title = fct_reorder(Industry_Title, desc(Annual_Growth_Rate))) %>%
ggplot(aes(x=Industry_Title, y=Annual_Growth_Rate)) +
geom_bar(stat="identity", fill="violetred4", alpha=.6, width=.4) +
coord_flip()
Projections7
#### Conclusion: Surprisingly, Amusement Parks and Arcades, as well as Scenic and Sightseeing Transportation have the highest estimated growth rate. Also surprising is that Telecommunications has a very negative outlook.
Wealth = read_csv("https://raw.githubusercontent.com/professorfoy/DATA-607/main/Top%20Wealth%20Distribution%20of%20Males%20in%202018%20-%20IRS%20Data.csv", show_col_types = FALSE, skip = 5)
## New names:
## * `` -> ...2
## * `` -> ...4
## * `` -> ...6
## * `` -> ...8
## * `` -> ...10
## * ...
glimpse(Wealth)
## Rows: 16
## Columns: 50
## $ Age <chr> NA, NA, NA, "Total", "Under 50", "5~
## $ ...2 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ `Total assets [1]` <chr> NA, "Number", "(1)", "456,850", "15~
## $ ...4 <chr> NA, "Amount", "(2)", "6,558,225", "~
## $ `Debts and mortgages` <chr> NA, "Number", "(3)", "333,190", "11~
## $ ...6 <chr> NA, "Amount", "(4)", "475,560", "12~
## $ `Net worth` <chr> NA, "Number", "(5)", "456,850", "15~
## $ ...8 <chr> NA, "Amount", "(6)", "6,082,665", "~
## $ `Personal residence` <chr> NA, "Number", "(7)", "327,445", "99~
## $ ...10 <chr> NA, "Amount", "(8)", "321,522", "88~
## $ `Other real estate` <chr> NA, "Number", "(9)", "265,165", "61~
## $ ...12 <chr> NA, "Amount", "(10)", "490,849", "9~
## $ `Closely held stock` <chr> NA, "Number", "(11)", "172,271", "5~
## $ ...14 <chr> NA, "Amount", "(12)", "1,072,850", ~
## $ `Publicly traded stock` <chr> NA, "Number", "(13)", "351,669", "1~
## $ ...16 <chr> NA, "Amount", "(14)", "1,209,736", ~
## $ `State and local government bonds` <chr> NA, "Number", "(15)", "187,504", "5~
## $ ...18 <chr> NA, "Amount", "(16)", "295,322", "6~
## $ `Federal bonds` <chr> NA, "Number", "(17)", "94,297", "31~
## $ ...20 <chr> NA, "Amount", "(18)", "33,541", "4,~
## $ `Corporate and foreign bonds` <chr> NA, "Number", "(19)", "151,809", "4~
## $ ...22 <chr> NA, "Amount", "(20)", "65,357", "23~
## $ `Bond funds` <chr> NA, "Number", "(21)", "103,352", "3~
## $ ...24 <chr> NA, "Amount", "(22)", "32,964", "12~
## $ `Diversified mutual funds` <chr> NA, "Number", "(23)", "203,446", "7~
## $ ...26 <chr> NA, "Amount", "(24)", "58,049", "18~
## $ `Unallocated investments` <chr> NA, "Number", "(25)", "58,346", "9,~
## $ ...28 <chr> NA, "Amount", "(26)", "45,232", "14~
## $ `Cash assets` <chr> NA, "Number", "(27)", "440,001", "1~
## $ ...30 <chr> NA, "Amount", "(28)", "453,502", "9~
## $ `Mortgages and notes` <chr> NA, "Number", "(29)", "132,194", "3~
## $ ...32 <chr> NA, "Amount", "(30)", "194,422", "3~
## $ `Cash value life insurance` <chr> NA, "Number", "(31)", "159,872", "4~
## $ ...34 <chr> NA, "Amount", "(32)", "34,298", "8,~
## $ `Noncorporate business assets` <chr> NA, "Number", "(33)", "227,502", "7~
## $ ...36 <chr> NA, "Amount", "(34)", "831,948", "2~
## $ `Farm assets` <chr> NA, "Number", "(35)", "58,459", "11~
## $ ...38 <chr> NA, "Amount", "(36)", "240,540", "5~
## $ `Private equity and hedge funds` <chr> NA, "Number", "(37)", "56,690", "20~
## $ ...40 <chr> NA, "Amount", "(38)", "267,348", "1~
## $ `Other limited partnerships` <chr> NA, "Number", "(39)", "118,330", "3~
## $ ...42 <chr> NA, "Amount", "(40)", "222,438", "4~
## $ `Retirement assets` <chr> NA, "Number", "(41)", "348,951", "1~
## $ ...44 <chr> NA, "Amount", "(42)", "500,949", "1~
## $ Art <chr> NA, "Number", "(43)", "46,392", "9,~
## $ ...46 <chr> NA, "Amount", "(44)", "40,910", "1,~
## $ `Other assets` <chr> NA, "Number", "(45)", "405,279", "1~
## $ ...48 <chr> NA, "Amount", "(46)", "146,448", "3~
## $ ...49 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ ...50 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,~
view(Wealth)
Wealth2 <- Wealth[-c(1,3, 11:16),]
tibble(Wealth2)
## # A tibble: 8 x 50
## Age ...2 `Total assets [1~ ...4 `Debts and mort~ ...6 `Net worth` ...8
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> NA Number Amount Number Amou~ Number Amou~
## 2 Total NA 456,850 6,558~ 333,190 475,~ 456,850 6,08~
## 3 Under~ NA 151,891 1,615~ 119,388 128,~ 151,891 1,48~
## 4 50 un~ NA 88,669 1,224~ 67,750 175,~ 88,669 1,04~
## 5 60 un~ NA 92,718 1,662~ 65,792 74,0~ 92,718 1,58~
## 6 70 un~ NA 68,877 1,125~ 45,048 54,0~ 68,877 1,07~
## 7 80 un~ NA 41,656 686,2~ 26,536 33,1~ 41,656 653,~
## 8 90 an~ NA 13,040 244,5~ 8,676 10,2~ 13,040 234,~
## # ... with 42 more variables: Personal residence <chr>, ...10 <chr>,
## # Other real estate <chr>, ...12 <chr>, Closely held stock <chr>,
## # ...14 <chr>, Publicly traded stock <chr>, ...16 <chr>,
## # State and local government bonds <chr>, ...18 <chr>, Federal bonds <chr>,
## # ...20 <chr>, Corporate and foreign bonds <chr>, ...22 <chr>,
## # Bond funds <chr>, ...24 <chr>, Diversified mutual funds <chr>, ...26 <chr>,
## # Unallocated investments <chr>, ...28 <chr>, Cash assets <chr>, ...
Wealth2 <- select(Wealth2,-'...2', -'...49', -'...50')
Wealth2
## # A tibble: 8 x 47
## Age `Total assets [1~ ...4 `Debts and mort~ ...6 `Net worth` ...8
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> Number Amount Number Amou~ Number Amou~
## 2 Total 456,850 6,558~ 333,190 475,~ 456,850 6,08~
## 3 Under 50 151,891 1,615~ 119,388 128,~ 151,891 1,48~
## 4 50 under 60 88,669 1,224~ 67,750 175,~ 88,669 1,04~
## 5 60 under 70 92,718 1,662~ 65,792 74,0~ 92,718 1,58~
## 6 70 under 80 68,877 1,125~ 45,048 54,0~ 68,877 1,07~
## 7 80 under 90 41,656 686,2~ 26,536 33,1~ 41,656 653,~
## 8 90 and older 13,040 244,5~ 8,676 10,2~ 13,040 234,~
## # ... with 40 more variables: Personal residence <chr>, ...10 <chr>,
## # Other real estate <chr>, ...12 <chr>, Closely held stock <chr>,
## # ...14 <chr>, Publicly traded stock <chr>, ...16 <chr>,
## # State and local government bonds <chr>, ...18 <chr>, Federal bonds <chr>,
## # ...20 <chr>, Corporate and foreign bonds <chr>, ...22 <chr>,
## # Bond funds <chr>, ...24 <chr>, Diversified mutual funds <chr>, ...26 <chr>,
## # Unallocated investments <chr>, ...28 <chr>, Cash assets <chr>, ...
# Attribution to Cliff Lee for helping me fine tune this code block.
swap_columns <- function(data_frame, start_column_number, end_column_number) {
for (i in seq(start_column_number, end_column_number, 2)) {
# Swap the column names.
temp_name <- colnames(data_frame)[i]
colnames(data_frame)[i] <- colnames(data_frame)[i+1]
colnames(data_frame)[i+1] <- temp_name
}
# Return the newly changed data_frame back to the caller.
return (data_frame)
}
Wealth3 <- swap_columns(Wealth2, 2, 48)
Wealth3
## # A tibble: 8 x 47
## Age ...4 `Total assets [1~ ...6 `Debts and mort~ ...8 `Net worth` ...10
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> Number Amount Numb~ Amount Numb~ Amount Numb~
## 2 Total 456,8~ 6,558,225 333,~ 475,560 456,~ 6,082,665 327,~
## 3 Under~ 151,8~ 1,615,158 119,~ 128,765 151,~ 1,486,393 99,8~
## 4 50 un~ 88,669 1,224,255 67,7~ 175,320 88,6~ 1,048,935 69,4~
## 5 60 un~ 92,718 1,662,388 65,7~ 74,021 92,7~ 1,588,367 70,2~
## 6 70 un~ 68,877 1,125,565 45,0~ 54,019 68,8~ 1,071,546 51,0~
## 7 80 un~ 41,656 686,279 26,5~ 33,191 41,6~ 653,088 28,9~
## 8 90 an~ 13,040 244,580 8,676 10,243 13,0~ 234,337 7,932
## # ... with 39 more variables: Personal residence <chr>, ...12 <chr>,
## # Other real estate <chr>, ...14 <chr>, Closely held stock <chr>,
## # ...16 <chr>, Publicly traded stock <chr>, ...18 <chr>,
## # State and local government bonds <chr>, ...20 <chr>, Federal bonds <chr>,
## # ...22 <chr>, Corporate and foreign bonds <chr>, ...24 <chr>,
## # Bond funds <chr>, ...26 <chr>, Diversified mutual funds <chr>, ...28 <chr>,
## # Unallocated investments <chr>, ...30 <chr>, Cash assets <chr>, ...
Wealth4 <- Wealth3[,seq(1,ncol(Wealth3),2)]
Wealth4
## # A tibble: 8 x 24
## Age `Total assets [1]` `Debts and mortgages` `Net worth` `Personal resid~
## <chr> <chr> <chr> <chr> <chr>
## 1 <NA> Amount Amount Amount Amount
## 2 Total 6,558,225 475,560 6,082,665 321,522
## 3 Under 50 1,615,158 128,765 1,486,393 88,936
## 4 50 under 60 1,224,255 175,320 1,048,935 73,935
## 5 60 under 70 1,662,388 74,021 1,588,367 63,508
## 6 70 under 80 1,125,565 54,019 1,071,546 53,476
## 7 80 under 90 686,279 33,191 653,088 32,736
## 8 90 and older 244,580 10,243 234,337 8,931
## # ... with 19 more variables: Other real estate <chr>,
## # Closely held stock <chr>, Publicly traded stock <chr>,
## # State and local government bonds <chr>, Federal bonds <chr>,
## # Corporate and foreign bonds <chr>, Bond funds <chr>,
## # Diversified mutual funds <chr>, Unallocated investments <chr>,
## # Cash assets <chr>, Mortgages and notes <chr>,
## # Cash value life insurance <chr>, Noncorporate business assets <chr>, ...
Wealth4 <- select(Wealth4,-'Total assets [1]',-'Net worth') %>%
na.omit()
Wealth4 <-subset(Wealth4, Age!='Total')
Wealth4
## # A tibble: 6 x 22
## Age `Debts and mort~ `Personal resid~ `Other real est~ `Closely held s~
## <chr> <chr> <chr> <chr> <chr>
## 1 Under 50 128,765 88,936 98,961 289,578
## 2 50 under 60 175,320 73,935 91,498 166,772
## 3 60 under 70 74,021 63,508 118,944 389,806
## 4 70 under 80 54,019 53,476 98,253 135,123
## 5 80 under 90 33,191 32,736 67,772 62,793
## 6 90 and older 10,243 8,931 15,422 28,777
## # ... with 17 more variables: Publicly traded stock <chr>,
## # State and local government bonds <chr>, Federal bonds <chr>,
## # Corporate and foreign bonds <chr>, Bond funds <chr>,
## # Diversified mutual funds <chr>, Unallocated investments <chr>,
## # Cash assets <chr>, Mortgages and notes <chr>,
## # Cash value life insurance <chr>, Noncorporate business assets <chr>,
## # Farm assets <chr>, Private equity and hedge funds <chr>, ...
Wealth5 <- select(Wealth4, 1, 5:6)
Wealth5
## # A tibble: 6 x 3
## Age `Closely held stock` `Publicly traded stock`
## <chr> <chr> <chr>
## 1 Under 50 289,578 293,889
## 2 50 under 60 166,772 202,204
## 3 60 under 70 389,806 259,606
## 4 70 under 80 135,123 227,558
## 5 80 under 90 62,793 161,315
## 6 90 and older 28,777 65,164
Wealth_Long <- melt(Wealth5, id.vars=c("Age"))
Wealth_Long
## Age variable value
## 1 Under 50 Closely held stock 289,578
## 2 50 under 60 Closely held stock 166,772
## 3 60 under 70 Closely held stock 389,806
## 4 70 under 80 Closely held stock 135,123
## 5 80 under 90 Closely held stock 62,793
## 6 90 and older Closely held stock 28,777
## 7 Under 50 Publicly traded stock 293,889
## 8 50 under 60 Publicly traded stock 202,204
## 9 60 under 70 Publicly traded stock 259,606
## 10 70 under 80 Publicly traded stock 227,558
## 11 80 under 90 Publicly traded stock 161,315
## 12 90 and older Publicly traded stock 65,164
as.factor(Wealth_Long$Age)
## [1] Under 50 50 under 60 60 under 70 70 under 80 80 under 90
## [6] 90 and older Under 50 50 under 60 60 under 70 70 under 80
## [11] 80 under 90 90 and older
## 6 Levels: 50 under 60 60 under 70 70 under 80 80 under 90 ... Under 50
ggplot(Wealth_Long, aes(x=variable, y=(value), fill=Age)) +
geom_bar(stat="identity", position=position_dodge())
#### Conclusion: This dataset was more untidy then the first two datasets, which required more extensive tidying. This data could have been analyzed in a variety of ways, but for manageablity, I just took two variables and stratefied them by age. However, I could not get ggplot to order the y-axis from 0 to the highest total when grouping age groups.