library(readxl)
# Load the adult income workbook into `adult_income_data`.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine that knits this report.
adult_income_data <- read_excel(
  path = "C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.xlsx"
)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Import the libraries needed to run these notes.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Open the loaded data in the data viewer for a quick visual check.
# NOTE(review): presumably tibble's view() attached via tidyverse, which only
# acts in interactive sessions — confirm it is wanted in a knitted report.
view(adult_income_data)
# Keep only the five numeric measures used for the descriptive statistics.
num_data <- select(
  adult_income_data,
  age, edunum, capitalgain, capitalloss, hoursperweek
)

# Per-column minimum, quartiles, median, mean and maximum.
summary_stats <- summary(num_data)
print(summary_stats)
## age edunum capitalgain capitalloss
## Min. :17.00 Min. : 1.00 Min. : 0 Min. : 0.0
## 1st Qu.:28.00 1st Qu.: 9.00 1st Qu.: 0 1st Qu.: 0.0
## Median :37.00 Median :10.00 Median : 0 Median : 0.0
## Mean :38.77 Mean :10.07 Mean : 1082 Mean : 87.9
## 3rd Qu.:48.00 3rd Qu.:12.00 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :90.00 Max. :16.00 Max. :99999 Max. :3770.0
## hoursperweek
## Min. : 1.00
## 1st Qu.:40.00
## Median :40.00
## Mean :40.39
## 3rd Qu.:45.00
## Max. :99.00
# Select categorical columns
cat_data <- adult_income_data %>%
  select(
    workclass, education, maritalstatus, occupation,
    relationship, race, sex, nativecountry
  )

# Build one frequency table per categorical column.
# Labels and counts are both read from the same table() result: unique()
# lists values in order of first appearance while table() sorts them, so
# placing the two side by side attaches counts to the wrong labels.
# useNA = "ifany" keeps missing values as their own row instead of dropping
# them silently.
cat_summaries <- lapply(cat_data, function(x) {
  counts <- table(x, useNA = "ifany")
  data.frame(
    Unique_Values = as.character(names(counts)),
    Counts = as.vector(counts)
  )
})
print(cat_summaries)
## $workclass
## Unique_Values Counts.x Counts.Freq
## 1 Private ? 963
## 2 Local-gov Federal-gov 472
## 3 ? Local-gov 1043
## 4 Self-emp-not-inc Never-worked 3
## 5 Federal-gov Private 11210
## 6 State-gov Self-emp-inc 579
## 7 Self-emp-inc Self-emp-not-inc 1321
## 8 Without-pay State-gov 683
## 9 Never-worked Without-pay 7
##
## $education
## Unique_Values Counts.x Counts.Freq
## 1 11th 10th 456
## 2 HS-grad 11th 637
## 3 Assoc-acdm 12th 224
## 4 Some-college 1st-4th 79
## 5 10th 5th-6th 176
## 6 Prof-school 7th-8th 309
## 7 7th-8th 9th 242
## 8 Bachelors Assoc-acdm 534
## 9 Masters Assoc-voc 679
## 10 Doctorate Bachelors 2670
## 11 5th-6th Doctorate 181
## 12 Assoc-voc HS-grad 5283
## 13 9th Masters 934
## 14 12th Preschool 32
## 15 1st-4th Prof-school 258
## 16 Preschool Some-college 3587
##
## $maritalstatus
## Unique_Values Counts.x Counts.Freq
## 1 Never-married Divorced 2190
## 2 Married-civ-spouse Married-AF-spouse 14
## 3 Widowed Married-civ-spouse 7403
## 4 Divorced Married-spouse-absent 210
## 5 Separated Never-married 5434
## 6 Married-spouse-absent Separated 505
## 7 Married-AF-spouse Widowed 525
##
## $occupation
## Unique_Values Counts.x Counts.Freq
## 1 Machine-op-inspct ? 966
## 2 Farming-fishing Adm-clerical 1841
## 3 Protective-serv Armed-Forces 6
## 4 ? Craft-repair 2013
## 5 Other-service Exec-managerial 2020
## 6 Prof-specialty Farming-fishing 496
## 7 Craft-repair Handlers-cleaners 702
## 8 Adm-clerical Machine-op-inspct 1020
## 9 Exec-managerial Other-service 1628
## 10 Tech-support Priv-house-serv 93
## 11 Sales Prof-specialty 2032
## 12 Priv-house-serv Protective-serv 334
## 13 Transport-moving Sales 1854
## 14 Handlers-cleaners Tech-support 518
## 15 Armed-Forces Transport-moving 758
##
## $relationship
## Unique_Values Counts.x Counts.Freq
## 1 Own-child Husband 6523
## 2 Husband Not-in-family 4278
## 3 Not-in-family Other-relative 525
## 4 Unmarried Own-child 2513
## 5 Wife Unmarried 1679
## 6 Other-relative Wife 763
##
## $race
## Unique_Values Counts.x Counts.Freq
## 1 Black Amer-Indian-Eskimo 159
## 2 White Asian-Pac-Islander 480
## 3 Asian-Pac-Islander Black 1561
## 4 Other Other 135
## 5 Amer-Indian-Eskimo White 13946
##
## $sex
## Unique_Values Counts.x Counts.Freq
## 1 Male Female 5421
## 2 Female Male 10860
##
## $nativecountry
## Unique_Values Counts.x Counts.Freq
## 1 United-States ? 274
## 2 ? Cambodia 9
## 3 Peru Canada 61
## 4 Guatemala China 47
## 5 Mexico Columbia 26
## 6 Dominican-Republic Cuba 43
## 7 Ireland Dominican-Republic 33
## 8 Germany Ecuador 17
## 9 Philippines El-Salvador 49
## 10 Thailand England 37
## 11 Haiti France 9
## 12 El-Salvador Germany 69
## 13 Puerto-Rico Greece 20
## 14 Vietnam Guatemala 24
## 15 South Haiti 31
## 16 Columbia Honduras 7
## 17 Japan Hong 10
## 18 India Hungary 6
## 19 Cambodia India 51
## 20 Poland Iran 16
## 21 Laos Ireland 13
## 22 England Italy 32
## 23 Cuba Jamaica 25
## 24 Taiwan Japan 30
## 25 Italy Laos 5
## 26 Canada Mexico 308
## 27 Portugal Nicaragua 15
## 28 China Outlying-US(Guam-USVI-etc) 9
## 29 Nicaragua Peru 15
## 30 Honduras Philippines 97
## 31 Iran Poland 27
## 32 Scotland Portugal 30
## 33 Jamaica Puerto-Rico 70
## 34 Ecuador Scotland 9
## 35 Yugoslavia South 35
## 36 Hungary Taiwan 14
## 37 Hong Thailand 12
## 38 Greece Trinadad&Tobago 8
## 39 Trinadad&Tobago United-States 14662
## 40 Outlying-US(Guam-USVI-etc) Vietnam 19
## 41 France Yugoslavia 7
# Overview of every column in the full data set.
adult_income_data %>%
  summary()
## age workclass fnlwgt education
## Min. :17.00 Length:16281 Min. : 13492 Length:16281
## 1st Qu.:28.00 Class :character 1st Qu.: 116736 Class :character
## Median :37.00 Mode :character Median : 177831 Mode :character
## Mean :38.77 Mean : 189436
## 3rd Qu.:48.00 3rd Qu.: 238384
## Max. :90.00 Max. :1490400
## edunum maritalstatus occupation relationship
## Min. : 1.00 Length:16281 Length:16281 Length:16281
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.07
## 3rd Qu.:12.00
## Max. :16.00
## race sex capitalgain capitalloss
## Length:16281 Length:16281 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1082 Mean : 87.9
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :3770.0
## hoursperweek nativecountry income
## Min. : 1.00 Length:16281 Length:16281
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.39
## 3rd Qu.:45.00
## Max. :99.00
The purpose of this dataset is to predict whether income exceeds $50K/yr based on census data.
Attributes and their possible values (the column names in the loaded data drop the hyphens, e.g. `edunum`, `maritalstatus`, `capitalgain`). The value "?" also appears in workclass, occupation and nativecountry in the counts above, presumably marking unknown entries.

- age: continuous.
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- fnlwgt: continuous.
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
# Standard deviation of age, skipping missing values.
stdDev <- with(adult_income_data, sd(age, na.rm = TRUE))
print(stdDev)
## [1] 13.84919
# Sum of capital gains over all rows, skipping missing values.
totalCapitalGain <- with(adult_income_data, sum(capitalgain, na.rm = TRUE))
print(totalCapitalGain)
## [1] 17614497
# Variance of the education number (edunum), skipping missing values.
# NOTE: the result is stored under the name `var`, which shadows the variance
# function for value lookups; the function is called through its namespace so
# the right-hand side stays unambiguous.
var <- stats::var(adult_income_data[["edunum"]], na.rm = TRUE)
print(var)
## [1] 6.592289
# Box plot of age for each sex, one filled box per category.
adult_income_data %>%
  ggplot(mapping = aes(x = sex, y = age, fill = sex)) +
  geom_boxplot() +
  labs(title = "Box Plots of Age Across Categories in Sex")
# Histogram of age in 5-year bins.
adult_income_data %>%
  ggplot(mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "red") +
  labs(
    x = "Age",
    y = "No. of People",
    title = "Distribution of age"
  )
# Correlation between age and capital gain. Rows with a missing value in
# either column are dropped, in line with the na.rm = TRUE used for the other
# statistics in this report; without it a single NA makes the result NA.
correlation <- cor(
  adult_income_data$age,
  adult_income_data$capitalgain,
  use = "complete.obs"
)

# Scatterplot with the correlation shown in the title. sprintf() fixes the
# value at two decimals and avoids the stray space that paste() put before
# the closing parenthesis.
ggplot(adult_income_data, aes(x = age, y = capitalgain)) +
  geom_point() +
  labs(
    title = sprintf(
      "Scatterplot of Age vs. CapitalGain (Correlation = %.2f)",
      correlation
    ),
    x = "Age",
    y = "CapitalGain"
  )
# Stacked bar chart: one bar per education level, segments filled by
# occupation. geom_bar() counts rows, so the y axis is a count rather than an
# occupation; the occupation label belongs on the fill legend.
ggplot(adult_income_data, aes(x = education, fill = occupation)) +
  geom_bar() +
  labs(
    title = "Interactions between education and occupation",
    x = "Education",
    y = "Count",
    fill = "Occupation"
  )
# Stacked histogram of age in 5-year bins, filled by relationship.
# The y axis of a histogram is the number of rows per bin, so it is labelled
# as a count; the relationship label belongs on the fill legend.
ggplot(adult_income_data, aes(x = age, fill = relationship)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of age by relationship",
    x = "Age",
    y = "No. of People",
    fill = "Relationship"
  ) +
  scale_fill_brewer(palette = "Set2")
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.