Question 1
Income.Inequality.US <- read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/incomeInequality.csv")
head(Income.Inequality.US)
## X Year Number.thousands quintile1 quintile2 median quintile3 quintile4
## 1 35 1947 37237 14243 22984 26764.14 31166 44223
## 2 36 1948 38624 13779 22655 26177.63 30248 42196
## 3 37 1949 39303 13007 22221 25809.28 29977 42361
## 4 38 1950 39929 13829 23779 27432.35 31647 43986
## 5 39 1951 40578 15070 24569 28258.04 32501 44376
## 6 40 1952 40832 15557 25165 29270.59 34046 46049
## p95 P90 P95 P99 P99.5 P99.9 P99.99 realGDP.M GDP.Deflator
## 1 72583 41477 54172 134415 203001 479022 1584506 2034674 12.90
## 2 68714 43459 58911 136814 210991 504131 1687123 2119102 13.63
## 3 67996 43923 58971 130037 197117 467824 1539131 2107551 13.62
## 4 71728 48465 61084 147986 230773 565506 1582089 2291213 13.75
## 5 69547 48655 61967 148725 222276 530408 1780264 2475821 14.68
## 6 71646 51928 65318 144459 218713 497284 1559300 2576630 15.00
## PopulationK realGDPperCap P95IRSvsCensus personsPerFamily realGDPperFamily
## 1 144126 14117.32 0.7463456 3.870505 54641.17
## 2 146631 14451.94 0.8573362 3.796370 54864.91
## 3 149188 14126.81 0.8672716 3.795843 53623.13
## 4 151684 15105.17 0.8516061 3.798843 57382.18
## 5 154287 16046.85 0.8910090 3.802233 61013.86
## 6 156954 16416.47 0.9116769 3.843897 63103.20
## mean.median
## 1 2.041581
## 2 2.095870
## 3 2.077669
## 4 2.091770
## 5 2.159168
## 6 2.155857
str(Income.Inequality.US)
## 'data.frame': 66 obs. of 23 variables:
## $ X : int 35 36 37 38 39 40 41 42 43 44 ...
## $ Year : int 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 ...
## $ Number.thousands: int 37237 38624 39303 39929 40578 40832 41202 41951 42889 43497 ...
## $ quintile1 : int 14243 13779 13007 13829 15070 15557 16100 15010 16643 17989 ...
## $ quintile2 : int 22984 22655 22221 23779 24569 25165 27322 26402 28325 30274 ...
## $ median : num 26764 26178 25809 27432 28258 ...
## $ quintile3 : int 31166 30248 29977 31647 32501 34046 36663 35742 38081 40161 ...
## $ quintile4 : int 44223 42196 42361 43986 44376 46049 49376 48969 51577 54454 ...
## $ p95 : int 72583 68714 67996 71728 69547 71646 76788 77601 79467 84396 ...
## $ P90 : int 41477 43459 43923 48465 48655 51928 53287 53441 58502 60261 ...
## $ P95 : int 54172 58911 58971 61084 61967 65318 66868 68546 75494 77500 ...
## $ P99 : int 134415 136814 130037 147986 148725 144459 142592 150457 162683 163383 ...
## $ P99.5 : int 203001 210991 197117 230773 222276 218713 211860 222702 242823 246756 ...
## $ P99.9 : int 479022 504131 467824 565506 530408 497284 465173 519497 572363 585061 ...
## $ P99.99 : int 1584506 1687123 1539131 1582089 1780264 1559300 1400826 1652847 1892250 1826274 ...
## $ realGDP.M : num 2034674 2119102 2107551 2291213 2475821 ...
## $ GDP.Deflator : num 12.9 13.6 13.6 13.8 14.7 ...
## $ PopulationK : int 144126 146631 149188 151684 154287 156954 159565 162391 165275 168221 ...
## $ realGDPperCap : num 14117 14452 14127 15105 16047 ...
## $ P95IRSvsCensus : num 0.746 0.857 0.867 0.852 0.891 ...
## $ personsPerFamily: num 3.87 3.8 3.8 3.8 3.8 ...
## $ realGDPperFamily: num 54641 54865 53623 57382 61014 ...
## $ mean.median : num 2.04 2.1 2.08 2.09 2.16 ...
class(Income.Inequality.US)
## [1] "data.frame"
dim(Income.Inequality.US)
## [1] 66 23
#Summary and standard deviation (sd) gets me everything that I need for Question 1, getting me Mean, Standard Deviation and Quintiles for each variable within the data set.
#In addition to this, the n_missing column shows missing data (if any)
summary(Income.Inequality.US)
## X Year Number.thousands quintile1
## Min. : 35.00 Min. :1947 Min. :37237 Min. :13007
## 1st Qu.: 51.25 1st Qu.:1963 1st Qu.:47644 1st Qu.:20492
## Median : 67.50 Median :1980 Median :59930 Median :27511
## Mean : 67.50 Mean :1980 Mean :59379 Mean :25038
## 3rd Qu.: 83.75 3rd Qu.:1996 3rd Qu.:70080 3rd Qu.:28591
## Max. :100.00 Max. :2012 Max. :80944 Max. :32000
## quintile2 median quintile3 quintile4
## Min. :22221 Min. :25809 Min. :29977 Min. : 42196
## 1st Qu.:34485 1st Qu.:40360 1st Qu.:47236 1st Qu.: 65556
## Median :46140 Median :54863 Median :65094 Median : 91970
## Mean :42778 Mean :51276 Mean :61500 Mean : 88610
## 3rd Qu.:49720 3rd Qu.:60867 3rd Qu.:74153 3rd Qu.:109339
## Max. :54827 Max. :67480 Max. :83054 Max. :124734
## p95 P90 P95 P99
## Min. : 67996 Min. : 41477 Min. : 54172 Min. :130037
## 1st Qu.:100058 1st Qu.: 73590 1st Qu.: 93446 1st Qu.:188694
## Median :146697 Median : 94248 Median :120700 Median :229680
## Mean :146542 Mean : 89723 Mean :119020 Mean :256446
## 3rd Qu.:186141 3rd Qu.:105095 3rd Qu.:141942 3rd Qu.:318603
## Max. :218395 Max. :121396 Max. :172037 Max. :441720
## P99.5 P99.9 P99.99 realGDP.M
## Min. :197117 Min. : 465173 Min. : 1400826 Min. : 2034674
## 1st Qu.:275863 1st Qu.: 612491 1st Qu.: 1921804 1st Qu.: 3757205
## Median :326630 Median : 738316 Median : 2955926 Median : 6774510
## Mean :380626 Mean : 998978 Mean : 4480481 Mean : 7733636
## 3rd Qu.:482267 3rd Qu.:1324486 3rd Qu.: 6696022 3rd Qu.:10976748
## Max. :699934 Max. :2273381 Max. :12708323 Max. :16245782
## GDP.Deflator PopulationK realGDPperCap P95IRSvsCensus
## Min. : 12.90 Min. :144126 Min. :14117 Min. :0.7320
## 1st Qu.: 18.20 1st Qu.:189957 1st Qu.:19777 1st Qu.:0.7637
## Median : 42.65 Median :226416 Median :29621 Median :0.8179
## Mean : 48.63 Mean :228259 Mean :31306 Mean :0.8314
## 3rd Qu.: 76.44 3rd Qu.:268933 3rd Qu.:40814 3rd Qu.:0.8906
## Max. :105.01 Max. :314278 Max. :51781 Max. :0.9892
## personsPerFamily realGDPperFamily mean.median
## Min. :3.722 Min. : 53623 Min. :1.846
## 1st Qu.:3.799 1st Qu.: 78854 1st Qu.:1.956
## Median :3.845 Median :112961 Median :2.096
## Mean :3.850 Mean :120230 Mean :2.282
## 3rd Qu.:3.891 3rd Qu.:156624 3rd Qu.:2.577
## Max. :4.006 Max. :200704 Max. :3.251
sd(Income.Inequality.US$quintile1)
## [1] 5424.794
sd(Income.Inequality.US$quintile2)
## [1] 9700.415
sd(Income.Inequality.US$P99.99)
## [1] 3155025
#Looking specifically at the variables I want to evaluate you can see that the historical mean of Quintile 1 income is incredible low, even compared to
#the next quintile 2. Looking at the standard deviation of some of the higher income groups show massive variation within standard deviations, specifically among
#the .01 percentile (99.99).
Question 2 - Data Wrangling
#Here I am taking the four highest income percentiles in the US and adding them into one variable
TopIncomeEarners <- rowSums(cbind(Income.Inequality.US$P99, Income.Inequality.US$P99.5, Income.Inequality.US$P99.9, Income.Inequality.US$P99.99), na.rm=TRUE)
LowestIncomeEarners <- rowSums(cbind(Income.Inequality.US$quintile1, Income.Inequality.US$quintile2, Income.Inequality.US$quintile3, Income.Inequality.US$quintile4), na.rm=TRUE)
#After getting the combined variable I am adding it into the old dataset
Income.Inequality.US <- cbind(Income.Inequality.US, TopIncomeEarners)
Income.Inequality.US <- cbind(Income.Inequality.US, LowestIncomeEarners)
#Subset of the data, pulling the quintiles, years and top income earners I want to evaluate
Income.Inequality.US <- Income.Inequality.US[Income.Inequality.US$Year>=1950, c("Year", "LowestIncomeEarners", "TopIncomeEarners", "quintile1", "quintile2", "quintile3", "quintile4")]
head(Income.Inequality.US)
## Year LowestIncomeEarners TopIncomeEarners quintile1 quintile2 quintile3
## 4 1950 113241 2526354 13829 23779 31647
## 5 1951 116516 2681673 15070 24569 32501
## 6 1952 120817 2419756 15557 25165 34046
## 7 1953 129461 2220451 16100 27322 36663
## 8 1954 126123 2545503 15010 26402 35742
## 9 1955 134626 2870119 16643 28325 38081
## quintile4
## 4 43986
## 5 44376
## 6 46049
## 7 49376
## 8 48969
## 9 51577
Question 3 - Histogram, Line Graph (Comparative), Scatter plot, ggarrange (This contains 5 graphs I wanted to highlight for analysis)
#Histogram
#PLOT #1 : Histograms
library(ggplot2)
library(ggpubr)
Q1.Income.Hist <- ggplot() +
geom_histogram(data = Income.Inequality.US, mapping = aes(x = LowestIncomeEarners,
color = "black", fill="white", title = "Q1 Income"))
## Warning in geom_histogram(data = Income.Inequality.US, mapping = aes(x =
## LowestIncomeEarners, : Ignoring unknown aesthetics: title
Q1.Income.Hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#PLOT #2 : Line Graph; Options is to change from scientific form (1.01 e^5) to a readable figure
options(scipen = 999)
Highest.Income.Line <- ggplot() +
geom_line(data = Income.Inequality.US, mapping = aes(x = Year, y = TopIncomeEarners), color = "darkseagreen3") +
geom_point(data = Income.Inequality.US, mapping = aes(x = Year, y = TopIncomeEarners), color = "darkorange") +
labs(x = "Years 1950-2012", y = "Top Income Earners",
title = "Income Top")
Lowest.Income.Line <- ggplot() +
geom_line(data = Income.Inequality.US, mapping = aes(x = Year, y = LowestIncomeEarners), color = "brown1") +
geom_point(data = Income.Inequality.US, mapping = aes(x = Year, y = LowestIncomeEarners), color = "coral") +
labs(x = "Years 1950-2012", y = "Lowest Income Earners",
title = "Income Lowest")
#PLOT #3 scatterplot
Highest.Income.Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=TopIncomeEarners)) +
geom_point(color="#69b3a2")+
geom_smooth(method = lm, color="red", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Highest Income Earners",
title = "Income Top")
Highest.Income.Scatter
## `geom_smooth()` using formula = 'y ~ x'
Lowest.Income.Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=LowestIncomeEarners)) +
geom_point(color="brown1")+
geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Lowest Income Earners",
title = "Income Lowest")
Lowest.Income.Scatter
## `geom_smooth()` using formula = 'y ~ x'
Quintile1Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile1)) +
geom_point(color="brown1")+
geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Q1 Income",
title = "Income Q1")
Quintile1Scatter
## `geom_smooth()` using formula = 'y ~ x'
Quintile2Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile2)) +
geom_point(color="brown1")+
geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Q2 Income",
title = "Income Q2")
Quintile2Scatter
## `geom_smooth()` using formula = 'y ~ x'
Quintile3Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile3)) +
geom_point(color="brown1")+
geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Q3 Income",
title = "Income Q3")
Quintile3Scatter
## `geom_smooth()` using formula = 'y ~ x'
Quintile4Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile4)) +
geom_point(color="brown1")+
geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
labs(x = "Years 1950-2012", y = "Q4 Income",
title = "Income Q4")
Quintile4Scatter
## `geom_smooth()` using formula = 'y ~ x'
#Utilizing the ggpubR funtion ggarrange to make a matrix with the five graphs I want to feature for the analysis
Final.Analysis <- ggarrange(Quintile1Scatter, Quintile2Scatter, Highest.Income.Scatter, Highest.Income.Line, Lowest.Income.Line,
labels = c("A", "B", "C", "D", "E"),
ncol = 3, nrow = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Final.Analysis
Conclusion
#This data exhibits a trend that the poor and middle income US citizens have faced over the last 40 years
#As seen in exhibit A, B and C there is a drastic difference in income growth between the highest earners and lowest quartiles,
#showing just how much ground middle and lower class america lost during the last 40 years compared to the highest earners.
#What is specifically important is the flattening of income growth of Q1 and Q2 starting around 1970. Overall this is just scratching the surface
#of inequality in America. Namely, Wealth inequality being the largest culprit against the lower and middle income classes.
BONUS
git.inequalityData <- read.csv("https://raw.githubusercontent.com/jonburns2454/USMacroData/master/incomeInequality.csv")
head(git.inequalityData)
## X Year Number.thousands quintile1 quintile2 median quintile3 quintile4
## 1 35 1947 37237 14243 22984 26764.14 31166 44223
## 2 36 1948 38624 13779 22655 26177.63 30248 42196
## 3 37 1949 39303 13007 22221 25809.28 29977 42361
## 4 38 1950 39929 13829 23779 27432.35 31647 43986
## 5 39 1951 40578 15070 24569 28258.04 32501 44376
## 6 40 1952 40832 15557 25165 29270.59 34046 46049
## p95 P90 P95 P99 P99.5 P99.9 P99.99 realGDP.M GDP.Deflator
## 1 72583 41477 54172 134415 203001 479022 1584506 2034674 12.90
## 2 68714 43459 58911 136814 210991 504131 1687123 2119102 13.63
## 3 67996 43923 58971 130037 197117 467824 1539131 2107551 13.62
## 4 71728 48465 61084 147986 230773 565506 1582089 2291213 13.75
## 5 69547 48655 61967 148725 222276 530408 1780264 2475821 14.68
## 6 71646 51928 65318 144459 218713 497284 1559300 2576630 15.00
## PopulationK realGDPperCap P95IRSvsCensus personsPerFamily realGDPperFamily
## 1 144126 14117.32 0.7463456 3.870505 54641.17
## 2 146631 14451.94 0.8573362 3.796370 54864.91
## 3 149188 14126.81 0.8672716 3.795843 53623.13
## 4 151684 15105.17 0.8516061 3.798843 57382.18
## 5 154287 16046.85 0.8910090 3.802233 61013.86
## 6 156954 16416.47 0.9116769 3.843897 63103.20
## mean.median
## 1 2.041581
## 2 2.095870
## 3 2.077669
## 4 2.091770
## 5 2.159168
## 6 2.155857