This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the adult income data set.
# The file location defaults to the original machine-specific path; set the
# ADULT_INCOME_CSV environment variable to read the CSV from somewhere else.
# NOTE(review): the printed group tables show categorical values with a
# leading space (e.g. " ?"); consider strip.white = TRUE if that is unwanted.
adult_income_path <- Sys.getenv(
  "ADULT_INCOME_CSV",
  unset = "C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.csv"
)
adult_income_data <- read.csv(adult_income_path)
# Per-column overview of the loaded data frame.
summary(adult_income_data)
## age workclass fnlwgt education
## Min. :17.00 Length:16281 Min. : 13492 Length:16281
## 1st Qu.:28.00 Class :character 1st Qu.: 116736 Class :character
## Median :37.00 Mode :character Median : 177831 Mode :character
## Mean :38.77 Mean : 189436
## 3rd Qu.:48.00 3rd Qu.: 238384
## Max. :90.00 Max. :1490400
## edunum maritalstatus occupation relationship
## Min. : 1.00 Length:16281 Length:16281 Length:16281
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.07
## 3rd Qu.:12.00
## Max. :16.00
## race sex capitalgain capitalloss
## Length:16281 Length:16281 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1082 Mean : 87.9
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :3770.0
## hoursperweek nativecountry income
## Min. : 1.00 Length:16281 Length:16281
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.39
## 3rd Qu.:45.00
## Max. :99.00
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# Group 1: mean and maximum of hours worked per week, one row per occupation
data_group <- summarise(
  group_by(adult_income_data, occupation),
  avg_hrs_week = mean(hoursperweek),
  Maximum = max(hoursperweek)
)
print(data_group)
## # A tibble: 15 × 3
## occupation avg_hrs_week Maximum
## <chr> <dbl> <int>
## 1 " ?" 31.6 99
## 2 " Adm-clerical" 38.0 99
## 3 " Armed-Forces" 43 50
## 4 " Craft-repair" 42.2 99
## 5 " Exec-managerial" 45.0 99
## 6 " Farming-fishing" 46.5 99
## 7 " Handlers-cleaners" 37.8 99
## 8 " Machine-op-inspct" 40.8 86
## 9 " Other-service" 34.9 99
## 10 " Priv-house-serv" 33.2 99
## 11 " Prof-specialty" 42.1 99
## 12 " Protective-serv" 42.6 99
## 13 " Sales" 40.7 99
## 14 " Tech-support" 40.2 84
## 15 " Transport-moving" 44.9 99
# Summary statistics for each column of the per-occupation table.
summary(data_group)
## occupation avg_hrs_week Maximum
## Length:15 Min. :31.60 Min. :50.00
## Class :character 1st Qu.:37.93 1st Qu.:99.00
## Mode :character Median :40.82 Median :99.00
## Mean :40.23 Mean :93.87
## 3rd Qu.:42.82 3rd Qu.:99.00
## Max. :46.47 Max. :99.00
# Visualization of group 1 data set:
# one bar per occupation showing its average weekly hours.
ggplot(data_group, aes(occupation, avg_hrs_week)) +
  geom_col(fill = "orange") +
  labs(title = "Avg hours worked per week by different occupations") +
  theme(axis.text.x = element_text(angle = 90))
# Probability of Group 1 data set:
# each occupation's average divided by the sum of all occupation averages.
data_group_prob <- data_group %>%
  mutate(prob_1 = avg_hrs_week / sum(avg_hrs_week))
# View() opens a data viewer window, so only call it in an interactive
# session; it is skipped when the document is knitted or run as a script.
if (interactive()) {
  View(data_group_prob)
}
# Group 2: mean and maximum of hours worked per week, one row per education
data_group_2 <- summarise(
  group_by(adult_income_data, education),
  avg_hrs_week = mean(hoursperweek),
  Maximum = max(hoursperweek)
)
print(data_group_2)
## # A tibble: 16 × 3
## education avg_hrs_week Maximum
## <chr> <dbl> <int>
## 1 " 10th" 36.9 99
## 2 " 11th" 34.0 99
## 3 " 12th" 34.6 70
## 4 " 1st-4th" 39.8 70
## 5 " 5th-6th" 39.0 99
## 6 " 7th-8th" 38.2 99
## 7 " 9th" 39.0 96
## 8 " Assoc-acdm" 41.4 99
## 9 " Assoc-voc" 41.8 99
## 10 " Bachelors" 42.2 99
## 11 " Doctorate" 45.7 99
## 12 " HS-grad" 40.8 99
## 13 " Masters" 43.1 99
## 14 " Preschool" 36.4 60
## 15 " Prof-school" 47.9 99
## 16 " Some-college" 38.9 99
# Visualization of Group 2 data set using scatter plot:
# one point per education level at its average weekly hours.
ggplot(data_group_2, aes(x = education, y = avg_hrs_week)) +
  geom_point() +
  labs(
    title = "Scatter Plot for education and average hours worked by each group",
    x = "Education",
    y = "Avg_hrs_per_week"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Probability of Group 2 data set:
# each education level's average divided by the sum of all averages.
data_group_2_prob <- data_group_2 %>%
  mutate(prob_2 = avg_hrs_week / sum(avg_hrs_week))
# View() opens a data viewer window, so only call it in an interactive
# session; it is skipped when the document is knitted or run as a script.
if (interactive()) {
  View(data_group_2_prob)
}
# Group 3: mean and maximum capital gain, one row per race
data_group_3 <- summarise(
  group_by(adult_income_data, race),
  capital_gain = mean(capitalgain),
  Maximum = max(capitalgain)
)
print(data_group_3)
## # A tibble: 5 × 3
## race capital_gain Maximum
## <chr> <dbl> <int>
## 1 " Amer-Indian-Eskimo" 370. 27828
## 2 " Asian-Pac-Islander" 1665. 99999
## 3 " Black" 545. 99999
## 4 " Other" 1081. 99999
## 5 " White" 1130. 99999
# Visualization of Group 3 data set:
# one bar per race showing its average capital gain. The layer is geom_col(),
# so the title calls it a bar chart rather than a line graph.
ggplot(data_group_3, aes(x = race, y = capital_gain)) +
  geom_col(fill = "purple") +
  labs(
    title = "Bar chart of race vs capital gain",
    x = "Race",
    y = "Capital Gain"
  )
# Probability of Group 3 data set:
# each race's average capital gain divided by the sum of all averages.
data_group_3_prob <- data_group_3 %>%
  mutate(prob_3 = capital_gain / sum(capital_gain))
# View() opens a data viewer window, so only call it in an interactive
# session; it is skipped when the document is knitted or run as a script.
if (interactive()) {
  View(data_group_3_prob)
}
# Keep the occupation row(s) whose prob_1 equals the smallest prob_1 and tag
# them with an Anomaly_1 label.
min_data_group_prob <- mutate(
  filter(data_group_prob, prob_1 == min(prob_1)),
  Anomaly_1 = "Targeted Prob anomaly"
)
summary(min_data_group_prob)
## occupation avg_hrs_week Maximum prob_1
## Length:1 Min. :31.6 Min. :99 Min. :0.05237
## Class :character 1st Qu.:31.6 1st Qu.:99 1st Qu.:0.05237
## Mode :character Median :31.6 Median :99 Median :0.05237
## Mean :31.6 Mean :99 Mean :0.05237
## 3rd Qu.:31.6 3rd Qu.:99 3rd Qu.:0.05237
## Max. :31.6 Max. :99 Max. :0.05237
## Anomaly_1
## Length:1
## Class :character
## Mode :character
##
##
##
# Keep the education row(s) whose prob_2 equals the smallest prob_2 and tag
# them with an Anomaly_2 label.
min_data_group_prob_2 <- mutate(
  filter(data_group_2_prob, prob_2 == min(prob_2)),
  Anomaly_2 = "Targeted Prob anomaly"
)
summary(min_data_group_prob_2)
## education avg_hrs_week Maximum prob_2
## Length:1 Min. :34 Min. :99 Min. :0.05315
## Class :character 1st Qu.:34 1st Qu.:99 1st Qu.:0.05315
## Mode :character Median :34 Median :99 Median :0.05315
## Mean :34 Mean :99 Mean :0.05315
## 3rd Qu.:34 3rd Qu.:99 3rd Qu.:0.05315
## Max. :34 Max. :99 Max. :0.05315
## Anomaly_2
## Length:1
## Class :character
## Mode :character
##
##
##
# Keep the race row(s) whose prob_3 equals the smallest prob_3 and tag them
# with an Anomaly_3 label.
min_data_group_prob_3 <- mutate(
  filter(data_group_3_prob, prob_3 == min(prob_3)),
  Anomaly_3 = "Targeted Prob anomaly"
)
summary(min_data_group_prob_3)
## race capital_gain Maximum prob_3
## Length:1 Min. :370.2 Min. :27828 Min. :0.07728
## Class :character 1st Qu.:370.2 1st Qu.:27828 1st Qu.:0.07728
## Mode :character Median :370.2 Median :27828 Median :0.07728
## Mean :370.2 Mean :27828 Mean :0.07728
## 3rd Qu.:370.2 3rd Qu.:27828 3rd Qu.:0.07728
## Max. :370.2 Max. :27828 Max. :0.07728
## Anomaly_3
## Length:1
## Class :character
## Mode :character
##
##
##
# Attach the three flagged rows to every record by occupation, education and
# race. Records without a match get NA in the joined columns, and the NA
# anomaly labels are then replaced with "Not Anomaly".
data <- adult_income_data %>%
  left_join(min_data_group_prob, by = "occupation") %>%
  left_join(min_data_group_prob_2, by = "education") %>%
  left_join(min_data_group_prob_3, by = "race") %>%
  mutate(
    Anomaly_1 = replace(Anomaly_1, is.na(Anomaly_1), "Not Anomaly"),
    Anomaly_2 = replace(Anomaly_2, is.na(Anomaly_2), "Not Anomaly"),
    Anomaly_3 = replace(Anomaly_3, is.na(Anomaly_3), "Not Anomaly")
  )
summary(data)
## age workclass fnlwgt education
## Min. :17.00 Length:16281 Min. : 13492 Length:16281
## 1st Qu.:28.00 Class :character 1st Qu.: 116736 Class :character
## Median :37.00 Mode :character Median : 177831 Mode :character
## Mean :38.77 Mean : 189436
## 3rd Qu.:48.00 3rd Qu.: 238384
## Max. :90.00 Max. :1490400
##
## edunum maritalstatus occupation relationship
## Min. : 1.00 Length:16281 Length:16281 Length:16281
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.07
## 3rd Qu.:12.00
## Max. :16.00
##
## race sex capitalgain capitalloss
## Length:16281 Length:16281 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1082 Mean : 87.9
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :3770.0
##
## hoursperweek nativecountry income avg_hrs_week.x
## Min. : 1.00 Length:16281 Length:16281 Min. :31.6
## 1st Qu.:40.00 Class :character Class :character 1st Qu.:31.6
## Median :40.00 Mode :character Mode :character Median :31.6
## Mean :40.39 Mean :31.6
## 3rd Qu.:45.00 3rd Qu.:31.6
## Max. :99.00 Max. :31.6
## NA's :15315
## Maximum.x prob_1 Anomaly_1 avg_hrs_week.y
## Min. :99 Min. :0.052 Length:16281 Min. :34
## 1st Qu.:99 1st Qu.:0.052 Class :character 1st Qu.:34
## Median :99 Median :0.052 Mode :character Median :34
## Mean :99 Mean :0.052 Mean :34
## 3rd Qu.:99 3rd Qu.:0.052 3rd Qu.:34
## Max. :99 Max. :0.052 Max. :34
## NA's :15315 NA's :15315 NA's :15644
## Maximum.y prob_2 Anomaly_2 capital_gain
## Min. :99 Min. :0.053 Length:16281 Min. :370.2
## 1st Qu.:99 1st Qu.:0.053 Class :character 1st Qu.:370.2
## Median :99 Median :0.053 Mode :character Median :370.2
## Mean :99 Mean :0.053 Mean :370.2
## 3rd Qu.:99 3rd Qu.:0.053 3rd Qu.:370.2
## Max. :99 Max. :0.053 Max. :370.2
## NA's :15644 NA's :15644 NA's :16122
## Maximum prob_3 Anomaly_3
## Min. :27828 Min. :0.077 Length:16281
## 1st Qu.:27828 1st Qu.:0.077 Class :character
## Median :27828 Median :0.077 Mode :character
## Mean :27828 Mean :0.077
## 3rd Qu.:27828 3rd Qu.:0.077
## Max. :27828 Max. :0.077
## NA's :16122 NA's :16122
# Generate all combinations of categorical variables
data <- adult_income_data
# Every possible occupation x education x race triple, observed or not.
combinations <- expand.grid(
  cat1 = unique(data$occupation),
  cat2 = unique(data$education),
  cat3 = unique(data$race)
)
# Number of records for each triple that actually occurs in the data.
combination_counts <- summarize(
  group_by(data, occupation, education, race),
  Count = n()
)
## `summarise()` has grouped output by 'occupation', 'education'. You can override
## using the `.groups` argument.
# View() opens a data viewer window, so only call it in an interactive
# session; it is skipped when the document is knitted or run as a script.
if (interactive()) {
  View(combination_counts)
}
# combination_counts is still grouped by occupation and education, so without
# ungroup() the minimum would be taken inside each of those groups rather
# than over the whole table. Drop the grouping to get the combinations that
# are least common overall.
least_common_combo <- combination_counts %>%
  ungroup() %>%
  filter(Count == min(Count))
cat("\nleast Common Combinations:\n")
##
## least Common Combinations:
# Print the rows selected into least_common_combo.
print(least_common_combo)
## # A tibble: 258 × 4
## # Groups: occupation, education [209]
## occupation education race Count
## <chr> <chr> <chr> <int>
## 1 " ?" " 10th" " Amer-Indian-Eskimo" 1
## 2 " ?" " 11th" " Amer-Indian-Eskimo" 1
## 3 " ?" " 11th" " Asian-Pac-Islander" 1
## 4 " ?" " 12th" " Black" 4
## 5 " ?" " 1st-4th" " Black" 2
## 6 " ?" " 5th-6th" " Black" 2
## 7 " ?" " 7th-8th" " Black" 2
## 8 " ?" " 9th" " Black" 3
## 9 " ?" " Assoc-acdm" " Black" 2
## 10 " ?" " Assoc-voc" " Asian-Pac-Islander" 1
## # ℹ 248 more rows
library(ggplot2)
# Bar chart of record counts per occupation, one panel per education level.
# Columns are referenced by bare name inside aes() and facet_wrap(); ggplot2
# discourages `df$col` there. No fill aesthetic is mapped, so the former fill
# label, scale_fill_discrete() and position_dodge() are dropped. With the
# default stacking, the per-race rows add up to the total for each occupation
# instead of being drawn on top of each other.
ggplot(combination_counts, aes(x = occupation, y = Count)) +
  geom_col() +
  facet_wrap(~education) +
  labs(
    x = "occupation",
    y = "Count",
    title = "Grouped Barplot for occupation, count and education"
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Heat map of how many records fall into each occupation/education cell.
# combination_counts has one row per occupation/education/race, so the counts
# are summed over race first; otherwise several tiles would be drawn on top of
# each other in the same cell and only the last one would be visible.
occupation_education_counts <- combination_counts %>%
  group_by(occupation, education) %>%
  summarise(Count = sum(Count), .groups = "drop")
# Columns are referenced by bare name; ggplot2 discourages `df$col` in aes().
ggplot(
  occupation_education_counts,
  aes(x = occupation, y = education, fill = Count)
) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "green") +
  labs(x = "occupation", y = "education", fill = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Heat map of how many records fall into each race/education cell.
# combination_counts has one row per occupation/education/race, so the counts
# are summed over occupation first; otherwise several tiles would be drawn on
# top of each other in the same cell and only the last one would be visible.
race_education_counts <- combination_counts %>%
  group_by(race, education) %>%
  summarise(Count = sum(Count), .groups = "drop")
# Columns are referenced by bare name; ggplot2 discourages `df$col` in aes().
ggplot(
  race_education_counts,
  aes(x = race, y = education, fill = Count)
) +
  geom_tile() +
  scale_fill_gradient(low = "orange", high = "white") +
  labs(x = "race", y = "education", fill = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
From the graphs above we can draw a few conclusions: there are more White people who completed HS-grad education compared to others, and people who completed HS-grad are more common in the craft-repair occupation. Also, most of the people who hold a job have an HS-grad, preschool, or some-college education, i.e. there are very few people who completed a higher degree such as a PhD. Some fields in the dataset have “?” as a value. This value might not affect the categorical attributes, but it definitely affects the numerical variables: it could mislead the analysis because the values are not summarized and aggregated properly. Hence, plotting against those variables makes no sense.