library(readxl)
# Load the adult income dataset from an Excel workbook into a data frame.
# NOTE(review): the path below is absolute and specific to one machine, so
# this line fails anywhere else — consider a project-relative path.
adult_income_data <- read_excel("C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.xlsx")

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading Data

Import the libraries needed to run these notes.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Open the loaded dataset in the interactive data viewer for a quick visual
# check before summarising it.
view(adult_income_data)

Numeric Summary of Data:

# Keep only the numeric variables of interest for the numeric summary.
num_data <- select(
  adult_income_data,
  age, edunum, capitalgain, capitalloss, hoursperweek
)

# Min, quartiles, mean and max for each selected column.
summary_stats <- summary(num_data)

print(summary_stats)
##       age            edunum       capitalgain     capitalloss    
##  Min.   :17.00   Min.   : 1.00   Min.   :    0   Min.   :   0.0  
##  1st Qu.:28.00   1st Qu.: 9.00   1st Qu.:    0   1st Qu.:   0.0  
##  Median :37.00   Median :10.00   Median :    0   Median :   0.0  
##  Mean   :38.77   Mean   :10.07   Mean   : 1082   Mean   :  87.9  
##  3rd Qu.:48.00   3rd Qu.:12.00   3rd Qu.:    0   3rd Qu.:   0.0  
##  Max.   :90.00   Max.   :16.00   Max.   :99999   Max.   :3770.0  
##   hoursperweek  
##  Min.   : 1.00  
##  1st Qu.:40.00  
##  Median :40.00  
##  Mean   :40.39  
##  3rd Qu.:45.00  
##  Max.   :99.00

Categorical Summary of Data:

# Select categorical columns
cat_data <- adult_income_data %>%
  select(
    workclass, education, maritalstatus, occupation,
    relationship, race, sex, nativecountry
  )

# Build one frequency table per categorical column.
# Labels and counts are both taken from the same table() result so that each
# value is paired with its own count. Pairing unique(x) (order of first
# appearance) with table(x) (sorted order) mislabels the rows.
cat_summaries <- lapply(cat_data, function(x) {
  counts <- table(x, useNA = "ifany")  # keep NA as its own row if present
  data.frame(
    Unique_Values = as.character(names(counts)),
    Counts = as.vector(counts),
    stringsAsFactors = FALSE
  )
})

print(cat_summaries)
## $workclass
##      Unique_Values         Counts.x Counts.Freq
## 1          Private                ?         963
## 2        Local-gov      Federal-gov         472
## 3                ?        Local-gov        1043
## 4 Self-emp-not-inc     Never-worked           3
## 5      Federal-gov          Private       11210
## 6        State-gov     Self-emp-inc         579
## 7     Self-emp-inc Self-emp-not-inc        1321
## 8      Without-pay        State-gov         683
## 9     Never-worked      Without-pay           7
## 
## $education
##    Unique_Values     Counts.x Counts.Freq
## 1           11th         10th         456
## 2        HS-grad         11th         637
## 3     Assoc-acdm         12th         224
## 4   Some-college      1st-4th          79
## 5           10th      5th-6th         176
## 6    Prof-school      7th-8th         309
## 7        7th-8th          9th         242
## 8      Bachelors   Assoc-acdm         534
## 9        Masters    Assoc-voc         679
## 10     Doctorate    Bachelors        2670
## 11       5th-6th    Doctorate         181
## 12     Assoc-voc      HS-grad        5283
## 13           9th      Masters         934
## 14          12th    Preschool          32
## 15       1st-4th  Prof-school         258
## 16     Preschool Some-college        3587
## 
## $maritalstatus
##           Unique_Values              Counts.x Counts.Freq
## 1         Never-married              Divorced        2190
## 2    Married-civ-spouse     Married-AF-spouse          14
## 3               Widowed    Married-civ-spouse        7403
## 4              Divorced Married-spouse-absent         210
## 5             Separated         Never-married        5434
## 6 Married-spouse-absent             Separated         505
## 7     Married-AF-spouse               Widowed         525
## 
## $occupation
##        Unique_Values          Counts.x Counts.Freq
## 1  Machine-op-inspct                 ?         966
## 2    Farming-fishing      Adm-clerical        1841
## 3    Protective-serv      Armed-Forces           6
## 4                  ?      Craft-repair        2013
## 5      Other-service   Exec-managerial        2020
## 6     Prof-specialty   Farming-fishing         496
## 7       Craft-repair Handlers-cleaners         702
## 8       Adm-clerical Machine-op-inspct        1020
## 9    Exec-managerial     Other-service        1628
## 10      Tech-support   Priv-house-serv          93
## 11             Sales    Prof-specialty        2032
## 12   Priv-house-serv   Protective-serv         334
## 13  Transport-moving             Sales        1854
## 14 Handlers-cleaners      Tech-support         518
## 15      Armed-Forces  Transport-moving         758
## 
## $relationship
##    Unique_Values       Counts.x Counts.Freq
## 1      Own-child        Husband        6523
## 2        Husband  Not-in-family        4278
## 3  Not-in-family Other-relative         525
## 4      Unmarried      Own-child        2513
## 5           Wife      Unmarried        1679
## 6 Other-relative           Wife         763
## 
## $race
##        Unique_Values           Counts.x Counts.Freq
## 1              Black Amer-Indian-Eskimo         159
## 2              White Asian-Pac-Islander         480
## 3 Asian-Pac-Islander              Black        1561
## 4              Other              Other         135
## 5 Amer-Indian-Eskimo              White       13946
## 
## $sex
##   Unique_Values Counts.x Counts.Freq
## 1          Male   Female        5421
## 2        Female     Male       10860
## 
## $nativecountry
##                 Unique_Values                   Counts.x Counts.Freq
## 1               United-States                          ?         274
## 2                           ?                   Cambodia           9
## 3                        Peru                     Canada          61
## 4                   Guatemala                      China          47
## 5                      Mexico                   Columbia          26
## 6          Dominican-Republic                       Cuba          43
## 7                     Ireland         Dominican-Republic          33
## 8                     Germany                    Ecuador          17
## 9                 Philippines                El-Salvador          49
## 10                   Thailand                    England          37
## 11                      Haiti                     France           9
## 12                El-Salvador                    Germany          69
## 13                Puerto-Rico                     Greece          20
## 14                    Vietnam                  Guatemala          24
## 15                      South                      Haiti          31
## 16                   Columbia                   Honduras           7
## 17                      Japan                       Hong          10
## 18                      India                    Hungary           6
## 19                   Cambodia                      India          51
## 20                     Poland                       Iran          16
## 21                       Laos                    Ireland          13
## 22                    England                      Italy          32
## 23                       Cuba                    Jamaica          25
## 24                     Taiwan                      Japan          30
## 25                      Italy                       Laos           5
## 26                     Canada                     Mexico         308
## 27                   Portugal                  Nicaragua          15
## 28                      China Outlying-US(Guam-USVI-etc)           9
## 29                  Nicaragua                       Peru          15
## 30                   Honduras                Philippines          97
## 31                       Iran                     Poland          27
## 32                   Scotland                   Portugal          30
## 33                    Jamaica                Puerto-Rico          70
## 34                    Ecuador                   Scotland           9
## 35                 Yugoslavia                      South          35
## 36                    Hungary                     Taiwan          14
## 37                       Hong                   Thailand          12
## 38                     Greece            Trinadad&Tobago           8
## 39            Trinadad&Tobago              United-States       14662
## 40 Outlying-US(Guam-USVI-etc)                    Vietnam          19
## 41                     France                 Yugoslavia           7

Combined Summary of Data:

# Summarise every column of the dataset at once: numeric columns report
# min, quartiles, mean and max; character columns report length, class and mode.
summary(adult_income_data)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:16281       Min.   :  13492   Length:16281      
##  1st Qu.:28.00   Class :character   1st Qu.: 116736   Class :character  
##  Median :37.00   Mode  :character   Median : 177831   Mode  :character  
##  Mean   :38.77                      Mean   : 189436                     
##  3rd Qu.:48.00                      3rd Qu.: 238384                     
##  Max.   :90.00                      Max.   :1490400                     
##      edunum      maritalstatus       occupation        relationship      
##  Min.   : 1.00   Length:16281       Length:16281       Length:16281      
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :10.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.07                                                           
##  3rd Qu.:12.00                                                           
##  Max.   :16.00                                                           
##      race               sex             capitalgain     capitalloss    
##  Length:16281       Length:16281       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1082   Mean   :  87.9  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :3770.0  
##   hoursperweek   nativecountry         income         
##  Min.   : 1.00   Length:16281       Length:16281      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.39                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00

Project’s Goal

The goal of this project is to predict whether a person's income exceeds $50K/yr based on census data.

Data Documentation

age: continuous. workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. fnlwgt: continuous. education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. education-num: continuous. marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. sex: Female, Male. capital-gain: continuous. capital-loss: continuous. hours-per-week: continuous. native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

Possible Insights:

  1. Explore the relationship between education level and income. Higher education often leads to higher income.
  2. Investigate how age is related to income.
  3. Examine how different occupations are associated with income.
  4. Analyze whether an individual's native country influences their income. This can help identify potential disparities based on nationality.
  5. Investigate whether there are income disparities among racial or ethnic groups. This can shed light on potential inequalities.

Aggregate Functions

# Sample standard deviation of age, ignoring missing values.
stdDev <- sd(adult_income_data$age, na.rm = TRUE)
print(stdDev)
## [1] 13.84919
# Total capital gain summed over all rows, ignoring missing values.
totalCapitalGain <- sum(adult_income_data$capitalgain, na.rm = TRUE)
print(totalCapitalGain)
## [1] 17614497
# Sample variance of the edunum column, ignoring missing values.
# NOTE: the result is stored under the name `var`, the same name as the
# function being called. The name is kept for compatibility; later calls to
# var() still work because R skips non-function bindings when resolving a call.
var <- var(adult_income_data$edunum, na.rm = TRUE)
print(var)
## [1] 6.592289

Visual Summary using different plots:

# Compare the age distribution of each sex with side-by-side box plots,
# filled by sex.
ggplot(
  data = adult_income_data,
  mapping = aes(x = sex, y = age, fill = sex)
) +
  geom_boxplot() +
  labs(title = "Box Plots of Age Across Categories in Sex")

# Histogram of age in 5-year bins, drawn as red bars with black outlines.
ggplot(data = adult_income_data, mapping = aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    fill = "red",
    color = "black"
  ) +
  labs(
    title = "Distribution of age",
    x = "Age",
    y = "No. of People"
  )

# Pearson correlation between age and capital gain. Rows with a missing value
# in either column are dropped, in line with the na.rm = TRUE used for the
# sd/sum/var aggregates on the same data.
correlation <- cor(
  adult_income_data$age,
  adult_income_data$capitalgain,
  use = "complete.obs"
)

# Scatterplot of capital gain against age, with the correlation in the title.
# sprintf() avoids the stray space before ")" that paste()'s default
# separator inserted, and always shows two decimal places.
ggplot(adult_income_data, aes(x = age, y = capitalgain)) +
  geom_point() +
  labs(
    title = sprintf(
      "Scatterplot of Age vs. CapitalGain (Correlation = %.2f)",
      correlation
    ),
    x = "Age",
    y = "CapitalGain"
  )

# Stacked bar chart: one bar per education level, split by occupation.
# geom_bar() plots row counts on the y axis, so the y label is "Count";
# occupation is the fill colour and is labelled on the legend instead.
ggplot(adult_income_data, aes(x = education, fill = occupation)) +
  geom_bar() +
  labs(
    title = "Interactions between education and occupation",
    x = "Education",
    y = "Count",
    fill = "Occupation"
  )

# Histogram of age in 5-year bins, with each bar split by relationship.
# The labels describe what is actually plotted: age on the x axis, row counts
# on the y axis, and relationship as the fill colour.
ggplot(adult_income_data, aes(x = age, fill = relationship)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of Age by Relationship",
    x = "Age",
    y = "No. of People",
    fill = "Relationship"
  ) +
  scale_fill_brewer(palette = "Set2")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.