Question 1

Income.Inequality.US <- read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/incomeInequality.csv")

head(Income.Inequality.US)
##    X Year Number.thousands quintile1 quintile2   median quintile3 quintile4
## 1 35 1947            37237     14243     22984 26764.14     31166     44223
## 2 36 1948            38624     13779     22655 26177.63     30248     42196
## 3 37 1949            39303     13007     22221 25809.28     29977     42361
## 4 38 1950            39929     13829     23779 27432.35     31647     43986
## 5 39 1951            40578     15070     24569 28258.04     32501     44376
## 6 40 1952            40832     15557     25165 29270.59     34046     46049
##     p95   P90   P95    P99  P99.5  P99.9  P99.99 realGDP.M GDP.Deflator
## 1 72583 41477 54172 134415 203001 479022 1584506   2034674        12.90
## 2 68714 43459 58911 136814 210991 504131 1687123   2119102        13.63
## 3 67996 43923 58971 130037 197117 467824 1539131   2107551        13.62
## 4 71728 48465 61084 147986 230773 565506 1582089   2291213        13.75
## 5 69547 48655 61967 148725 222276 530408 1780264   2475821        14.68
## 6 71646 51928 65318 144459 218713 497284 1559300   2576630        15.00
##   PopulationK realGDPperCap P95IRSvsCensus personsPerFamily realGDPperFamily
## 1      144126      14117.32      0.7463456         3.870505         54641.17
## 2      146631      14451.94      0.8573362         3.796370         54864.91
## 3      149188      14126.81      0.8672716         3.795843         53623.13
## 4      151684      15105.17      0.8516061         3.798843         57382.18
## 5      154287      16046.85      0.8910090         3.802233         61013.86
## 6      156954      16416.47      0.9116769         3.843897         63103.20
##   mean.median
## 1    2.041581
## 2    2.095870
## 3    2.077669
## 4    2.091770
## 5    2.159168
## 6    2.155857
str(Income.Inequality.US)
## 'data.frame':    66 obs. of  23 variables:
##  $ X               : int  35 36 37 38 39 40 41 42 43 44 ...
##  $ Year            : int  1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 ...
##  $ Number.thousands: int  37237 38624 39303 39929 40578 40832 41202 41951 42889 43497 ...
##  $ quintile1       : int  14243 13779 13007 13829 15070 15557 16100 15010 16643 17989 ...
##  $ quintile2       : int  22984 22655 22221 23779 24569 25165 27322 26402 28325 30274 ...
##  $ median          : num  26764 26178 25809 27432 28258 ...
##  $ quintile3       : int  31166 30248 29977 31647 32501 34046 36663 35742 38081 40161 ...
##  $ quintile4       : int  44223 42196 42361 43986 44376 46049 49376 48969 51577 54454 ...
##  $ p95             : int  72583 68714 67996 71728 69547 71646 76788 77601 79467 84396 ...
##  $ P90             : int  41477 43459 43923 48465 48655 51928 53287 53441 58502 60261 ...
##  $ P95             : int  54172 58911 58971 61084 61967 65318 66868 68546 75494 77500 ...
##  $ P99             : int  134415 136814 130037 147986 148725 144459 142592 150457 162683 163383 ...
##  $ P99.5           : int  203001 210991 197117 230773 222276 218713 211860 222702 242823 246756 ...
##  $ P99.9           : int  479022 504131 467824 565506 530408 497284 465173 519497 572363 585061 ...
##  $ P99.99          : int  1584506 1687123 1539131 1582089 1780264 1559300 1400826 1652847 1892250 1826274 ...
##  $ realGDP.M       : num  2034674 2119102 2107551 2291213 2475821 ...
##  $ GDP.Deflator    : num  12.9 13.6 13.6 13.8 14.7 ...
##  $ PopulationK     : int  144126 146631 149188 151684 154287 156954 159565 162391 165275 168221 ...
##  $ realGDPperCap   : num  14117 14452 14127 15105 16047 ...
##  $ P95IRSvsCensus  : num  0.746 0.857 0.867 0.852 0.891 ...
##  $ personsPerFamily: num  3.87 3.8 3.8 3.8 3.8 ...
##  $ realGDPperFamily: num  54641 54865 53623 57382 61014 ...
##  $ mean.median     : num  2.04 2.1 2.08 2.09 2.16 ...
class(Income.Inequality.US)
## [1] "data.frame"
dim(Income.Inequality.US)
## [1] 66 23
#Summary and standard deviation (sd) gets me everything that I need for Question 1, getting me Mean, Standard Deviation and Quintiles for each variable within the data set.
#In addition to this, the n_missing column shows missing data (if any)

summary(Income.Inequality.US)
##        X               Year      Number.thousands   quintile1    
##  Min.   : 35.00   Min.   :1947   Min.   :37237    Min.   :13007  
##  1st Qu.: 51.25   1st Qu.:1963   1st Qu.:47644    1st Qu.:20492  
##  Median : 67.50   Median :1980   Median :59930    Median :27511  
##  Mean   : 67.50   Mean   :1980   Mean   :59379    Mean   :25038  
##  3rd Qu.: 83.75   3rd Qu.:1996   3rd Qu.:70080    3rd Qu.:28591  
##  Max.   :100.00   Max.   :2012   Max.   :80944    Max.   :32000  
##    quintile2         median        quintile3       quintile4     
##  Min.   :22221   Min.   :25809   Min.   :29977   Min.   : 42196  
##  1st Qu.:34485   1st Qu.:40360   1st Qu.:47236   1st Qu.: 65556  
##  Median :46140   Median :54863   Median :65094   Median : 91970  
##  Mean   :42778   Mean   :51276   Mean   :61500   Mean   : 88610  
##  3rd Qu.:49720   3rd Qu.:60867   3rd Qu.:74153   3rd Qu.:109339  
##  Max.   :54827   Max.   :67480   Max.   :83054   Max.   :124734  
##       p95              P90              P95              P99        
##  Min.   : 67996   Min.   : 41477   Min.   : 54172   Min.   :130037  
##  1st Qu.:100058   1st Qu.: 73590   1st Qu.: 93446   1st Qu.:188694  
##  Median :146697   Median : 94248   Median :120700   Median :229680  
##  Mean   :146542   Mean   : 89723   Mean   :119020   Mean   :256446  
##  3rd Qu.:186141   3rd Qu.:105095   3rd Qu.:141942   3rd Qu.:318603  
##  Max.   :218395   Max.   :121396   Max.   :172037   Max.   :441720  
##      P99.5            P99.9             P99.99           realGDP.M       
##  Min.   :197117   Min.   : 465173   Min.   : 1400826   Min.   : 2034674  
##  1st Qu.:275863   1st Qu.: 612491   1st Qu.: 1921804   1st Qu.: 3757205  
##  Median :326630   Median : 738316   Median : 2955926   Median : 6774510  
##  Mean   :380626   Mean   : 998978   Mean   : 4480481   Mean   : 7733636  
##  3rd Qu.:482267   3rd Qu.:1324486   3rd Qu.: 6696022   3rd Qu.:10976748  
##  Max.   :699934   Max.   :2273381   Max.   :12708323   Max.   :16245782  
##   GDP.Deflator     PopulationK     realGDPperCap   P95IRSvsCensus  
##  Min.   : 12.90   Min.   :144126   Min.   :14117   Min.   :0.7320  
##  1st Qu.: 18.20   1st Qu.:189957   1st Qu.:19777   1st Qu.:0.7637  
##  Median : 42.65   Median :226416   Median :29621   Median :0.8179  
##  Mean   : 48.63   Mean   :228259   Mean   :31306   Mean   :0.8314  
##  3rd Qu.: 76.44   3rd Qu.:268933   3rd Qu.:40814   3rd Qu.:0.8906  
##  Max.   :105.01   Max.   :314278   Max.   :51781   Max.   :0.9892  
##  personsPerFamily realGDPperFamily  mean.median   
##  Min.   :3.722    Min.   : 53623   Min.   :1.846  
##  1st Qu.:3.799    1st Qu.: 78854   1st Qu.:1.956  
##  Median :3.845    Median :112961   Median :2.096  
##  Mean   :3.850    Mean   :120230   Mean   :2.282  
##  3rd Qu.:3.891    3rd Qu.:156624   3rd Qu.:2.577  
##  Max.   :4.006    Max.   :200704   Max.   :3.251
sd(Income.Inequality.US$quintile1)
## [1] 5424.794
sd(Income.Inequality.US$quintile2)
## [1] 9700.415
sd(Income.Inequality.US$P99.99)
## [1] 3155025
#Looking specifically at the variables I want to evaluate you can see that the historical mean of Quintile 1 income is incredible low, even compared to 
#the next quintile 2. Looking at the standard deviation of some of the higher income groups show massive variation within standard deviations, specifically among
#the .01 percentile (99.99).

Question 2 - Data Wrangling

#Here I am taking the four highest income percentiles in the US and adding them into one variable


TopIncomeEarners <- rowSums(cbind(Income.Inequality.US$P99, Income.Inequality.US$P99.5, Income.Inequality.US$P99.9, Income.Inequality.US$P99.99), na.rm=TRUE)

LowestIncomeEarners <- rowSums(cbind(Income.Inequality.US$quintile1, Income.Inequality.US$quintile2, Income.Inequality.US$quintile3, Income.Inequality.US$quintile4), na.rm=TRUE)

#After getting the combined variable I am adding it into the old dataset

Income.Inequality.US <- cbind(Income.Inequality.US, TopIncomeEarners)

Income.Inequality.US <- cbind(Income.Inequality.US, LowestIncomeEarners)


#Subset of the data, pulling the quintiles, years and top income earners I want to evaluate

Income.Inequality.US <- Income.Inequality.US[Income.Inequality.US$Year>=1950, c("Year", "LowestIncomeEarners", "TopIncomeEarners", "quintile1", "quintile2", "quintile3", "quintile4")]

head(Income.Inequality.US)
##   Year LowestIncomeEarners TopIncomeEarners quintile1 quintile2 quintile3
## 4 1950              113241          2526354     13829     23779     31647
## 5 1951              116516          2681673     15070     24569     32501
## 6 1952              120817          2419756     15557     25165     34046
## 7 1953              129461          2220451     16100     27322     36663
## 8 1954              126123          2545503     15010     26402     35742
## 9 1955              134626          2870119     16643     28325     38081
##   quintile4
## 4     43986
## 5     44376
## 6     46049
## 7     49376
## 8     48969
## 9     51577

Question 3 - Histogram, Line Graph (Comparative), Scatter plot, ggarrange (This contains 5 graphs I wanted to highlight for analysis)

#Histogram

#PLOT #1 :  Histograms
library(ggplot2)
library(ggpubr)

Q1.Income.Hist <- ggplot() +
    geom_histogram(data = Income.Inequality.US, mapping = aes(x = LowestIncomeEarners, 
    color = "black", fill="white", title = "Q1 Income"))
## Warning in geom_histogram(data = Income.Inequality.US, mapping = aes(x =
## LowestIncomeEarners, : Ignoring unknown aesthetics: title
Q1.Income.Hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#PLOT #2 : Line Graph; Options is to change from scientific form (1.01 e^5) to a readable figure
options(scipen = 999)
Highest.Income.Line <- ggplot() +
    geom_line(data = Income.Inequality.US, mapping = aes(x = Year, y = TopIncomeEarners), color = "darkseagreen3") +
    geom_point(data = Income.Inequality.US, mapping = aes(x = Year, y = TopIncomeEarners), color = "darkorange") +
    labs(x = "Years 1950-2012", y = "Top Income Earners", 
    title = "Income Top")

Lowest.Income.Line <- ggplot() +
    geom_line(data = Income.Inequality.US, mapping = aes(x = Year, y = LowestIncomeEarners), color = "brown1") +
    geom_point(data = Income.Inequality.US, mapping = aes(x = Year, y = LowestIncomeEarners), color = "coral") +
    labs(x = "Years 1950-2012", y = "Lowest Income Earners", 
    title = "Income Lowest")



#PLOT #3 scatterplot

Highest.Income.Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=TopIncomeEarners)) +
    geom_point(color="#69b3a2")+ 
    geom_smooth(method = lm, color="red", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Highest Income Earners",
    title = "Income Top")

Highest.Income.Scatter
## `geom_smooth()` using formula = 'y ~ x'

Lowest.Income.Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=LowestIncomeEarners)) +
    geom_point(color="brown1")+ 
    geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Lowest Income Earners",
    title = "Income Lowest")
Lowest.Income.Scatter
## `geom_smooth()` using formula = 'y ~ x'

Quintile1Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile1)) +
    geom_point(color="brown1")+ 
    geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Q1 Income",
     title = "Income Q1")
Quintile1Scatter
## `geom_smooth()` using formula = 'y ~ x'

Quintile2Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile2)) +
    geom_point(color="brown1")+ 
    geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Q2 Income",
    title = "Income Q2")
Quintile2Scatter
## `geom_smooth()` using formula = 'y ~ x'

Quintile3Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile3)) +
    geom_point(color="brown1")+ 
    geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Q3 Income",
    title = "Income Q3")
Quintile3Scatter
## `geom_smooth()` using formula = 'y ~ x'

Quintile4Scatter <- ggplot(Income.Inequality.US, aes(x=Year, y=quintile4)) +
    geom_point(color="brown1")+ 
    geom_smooth(method = lm, color="coral", fill="plum", se = TRUE) +
    labs(x = "Years 1950-2012", y = "Q4 Income",
    title = "Income Q4")
Quintile4Scatter
## `geom_smooth()` using formula = 'y ~ x'

#Utilizing the ggpubR funtion ggarrange to make a matrix with the five graphs I want to feature for the analysis

Final.Analysis <- ggarrange(Quintile1Scatter, Quintile2Scatter, Highest.Income.Scatter, Highest.Income.Line, Lowest.Income.Line, 
                            labels = c("A", "B", "C", "D", "E"),
                            ncol = 3, nrow = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Final.Analysis

Conclusion

#This data exhibits a trend that the poor and middle income US citizens have faced over the last 40 years
#As seen in exhibit A, B and C there is a drastic difference in income growth between the highest earners and lowest quartiles,
#showing just how much ground middle and lower class america lost during the last 40 years compared to the highest earners.
#What is specifically important is the flattening of income growth of Q1 and Q2 starting around 1970. Overall this is just scratching the surface
#of inequality in America. Namely, Wealth inequality being the largest culprit against the lower and middle income classes.

BONUS

git.inequalityData <- read.csv("https://raw.githubusercontent.com/jonburns2454/USMacroData/master/incomeInequality.csv")
head(git.inequalityData)
##    X Year Number.thousands quintile1 quintile2   median quintile3 quintile4
## 1 35 1947            37237     14243     22984 26764.14     31166     44223
## 2 36 1948            38624     13779     22655 26177.63     30248     42196
## 3 37 1949            39303     13007     22221 25809.28     29977     42361
## 4 38 1950            39929     13829     23779 27432.35     31647     43986
## 5 39 1951            40578     15070     24569 28258.04     32501     44376
## 6 40 1952            40832     15557     25165 29270.59     34046     46049
##     p95   P90   P95    P99  P99.5  P99.9  P99.99 realGDP.M GDP.Deflator
## 1 72583 41477 54172 134415 203001 479022 1584506   2034674        12.90
## 2 68714 43459 58911 136814 210991 504131 1687123   2119102        13.63
## 3 67996 43923 58971 130037 197117 467824 1539131   2107551        13.62
## 4 71728 48465 61084 147986 230773 565506 1582089   2291213        13.75
## 5 69547 48655 61967 148725 222276 530408 1780264   2475821        14.68
## 6 71646 51928 65318 144459 218713 497284 1559300   2576630        15.00
##   PopulationK realGDPperCap P95IRSvsCensus personsPerFamily realGDPperFamily
## 1      144126      14117.32      0.7463456         3.870505         54641.17
## 2      146631      14451.94      0.8573362         3.796370         54864.91
## 3      149188      14126.81      0.8672716         3.795843         53623.13
## 4      151684      15105.17      0.8516061         3.798843         57382.18
## 5      154287      16046.85      0.8910090         3.802233         61013.86
## 6      156954      16416.47      0.9116769         3.843897         63103.20
##   mean.median
## 1    2.041581
## 2    2.095870
## 3    2.077669
## 4    2.091770
## 5    2.159168
## 6    2.155857