R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the adult income CSV into a data frame and print per-column summary
# statistics.
# NOTE(review): the path is absolute and machine-specific. The grouped tables
# printed later in this document show values with a leading space (e.g. " ?"),
# so fields appear not to be whitespace-trimmed on import -- confirm whether
# that is intended.
adult_income_data <- read.csv(
  file = "C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.csv"
)
summary(object = adult_income_data)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:16281       Min.   :  13492   Length:16281      
##  1st Qu.:28.00   Class :character   1st Qu.: 116736   Class :character  
##  Median :37.00   Mode  :character   Median : 177831   Mode  :character  
##  Mean   :38.77                      Mean   : 189436                     
##  3rd Qu.:48.00                      3rd Qu.: 238384                     
##  Max.   :90.00                      Max.   :1490400                     
##      edunum      maritalstatus       occupation        relationship      
##  Min.   : 1.00   Length:16281       Length:16281       Length:16281      
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :10.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.07                                                           
##  3rd Qu.:12.00                                                           
##  Max.   :16.00                                                           
##      race               sex             capitalgain     capitalloss    
##  Length:16281       Length:16281       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1082   Mean   :  87.9  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :3770.0  
##   hoursperweek   nativecountry         income         
##  Min.   : 1.00   Length:16281       Length:16281      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.39                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

PART - 1

Grouping Data

# Group 1: one row per occupation holding the mean and the maximum of
# hoursperweek for that occupation.
data_group <- summarise(
  group_by(adult_income_data, occupation),
  avg_hrs_week = mean(hoursperweek),
  Maximum = max(hoursperweek)
)

print(data_group)
## # A tibble: 15 × 3
##    occupation           avg_hrs_week Maximum
##    <chr>                       <dbl>   <int>
##  1 " ?"                         31.6      99
##  2 " Adm-clerical"              38.0      99
##  3 " Armed-Forces"              43        50
##  4 " Craft-repair"              42.2      99
##  5 " Exec-managerial"           45.0      99
##  6 " Farming-fishing"           46.5      99
##  7 " Handlers-cleaners"         37.8      99
##  8 " Machine-op-inspct"         40.8      86
##  9 " Other-service"             34.9      99
## 10 " Priv-house-serv"           33.2      99
## 11 " Prof-specialty"            42.1      99
## 12 " Protective-serv"           42.6      99
## 13 " Sales"                     40.7      99
## 14 " Tech-support"              40.2      84
## 15 " Transport-moving"          44.9      99
# Distribution of the per-occupation means and maxima computed above
# (summary statistics over the rows of the grouped table, not over the raw data).
summary(data_group)
##   occupation         avg_hrs_week      Maximum     
##  Length:15          Min.   :31.60   Min.   :50.00  
##  Class :character   1st Qu.:37.93   1st Qu.:99.00  
##  Mode  :character   Median :40.82   Median :99.00  
##                     Mean   :40.23   Mean   :93.87  
##                     3rd Qu.:42.82   3rd Qu.:99.00  
##                     Max.   :46.47   Max.   :99.00
# Visualization of group 1 data set: bar chart of the mean weekly hours for
# each occupation.
ggplot(data_group, aes(x = occupation, y = avg_hrs_week)) +
  geom_col(fill = "orange") +
  labs(title = "Avg hours worked per week by different occupations") +
  # Rotate the occupation names so the category labels do not overlap.
  theme(axis.text.x = element_text(angle = 90))

# Probability of Group 1 data set.
# prob_1 is each occupation's mean weekly hours divided by the sum of all
# occupation means, so the column sums to 1 across the table.
data_group_prob <- data_group %>%
  mutate(prob_1 = avg_hrs_week / sum(avg_hrs_week))
# View() opens a viewer window; only call it in an interactive session so
# that rendering the document non-interactively does not depend on a display.
if (interactive()) {
  View(data_group_prob)
}
# Group 2: one row per education level holding the mean and the maximum of
# hoursperweek for that level.
data_group_2 <- summarise(
  group_by(adult_income_data, education),
  avg_hrs_week = mean(hoursperweek),
  Maximum = max(hoursperweek)
)

print(data_group_2)
## # A tibble: 16 × 3
##    education       avg_hrs_week Maximum
##    <chr>                  <dbl>   <int>
##  1 " 10th"                 36.9      99
##  2 " 11th"                 34.0      99
##  3 " 12th"                 34.6      70
##  4 " 1st-4th"              39.8      70
##  5 " 5th-6th"              39.0      99
##  6 " 7th-8th"              38.2      99
##  7 " 9th"                  39.0      96
##  8 " Assoc-acdm"           41.4      99
##  9 " Assoc-voc"            41.8      99
## 10 " Bachelors"            42.2      99
## 11 " Doctorate"            45.7      99
## 12 " HS-grad"              40.8      99
## 13 " Masters"              43.1      99
## 14 " Preschool"            36.4      60
## 15 " Prof-school"          47.9      99
## 16 " Some-college"         38.9      99
# Visualization of Group 2 data set: one point per education level, placed at
# that level's mean weekly hours.
ggplot(
  data = data_group_2,
  mapping = aes(x = education, y = avg_hrs_week)
) +
  geom_point() +
  # Tilt the education labels and right-align them under their tick marks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Scatter Plot for education and average hours worked by each group",
    x = "Education",
    y = "Avg_hrs_per_week"
  )

# Probability of Group 2 data set.
# prob_2 is each education level's mean weekly hours divided by the sum of all
# education-level means, so the column sums to 1 across the table.
data_group_2_prob <- data_group_2 %>%
  mutate(prob_2 = avg_hrs_week / sum(avg_hrs_week))
# View() opens a viewer window; only call it in an interactive session so
# that rendering the document non-interactively does not depend on a display.
if (interactive()) {
  View(data_group_2_prob)
}
# Group 3: one row per race holding the mean and the maximum of capitalgain
# for that race.
data_group_3 <- summarise(
  group_by(adult_income_data, race),
  capital_gain = mean(capitalgain),
  Maximum = max(capitalgain)
)

print(data_group_3)
## # A tibble: 5 × 3
##   race                  capital_gain Maximum
##   <chr>                        <dbl>   <int>
## 1 " Amer-Indian-Eskimo"         370.   27828
## 2 " Asian-Pac-Islander"        1665.   99999
## 3 " Black"                      545.   99999
## 4 " Other"                     1081.   99999
## 5 " White"                     1130.   99999
# Visualization of Group 3 data set: bar chart of the mean capital gain per
# race. geom_col() draws bars, so the title describes a bar chart rather than
# a line graph.
ggplot(data_group_3, aes(x = race, y = capital_gain)) +
  geom_col(fill = "purple") +
  labs(
    title = "Bar chart of race vs capital gain",
    x = "Race",
    y = "Capital Gain"
  )

# Probability of Group 3 data set.
# prob_3 is each race's mean capital gain divided by the sum of all per-race
# means, so the column sums to 1 across the table.
data_group_3_prob <- data_group_3 %>%
  mutate(prob_3 = capital_gain / sum(capital_gain))
# View() opens a viewer window; only call it in an interactive session so
# that rendering the document non-interactively does not depend on a display.
if (interactive()) {
  View(data_group_3_prob)
}

Lowest probabilities of all three data groups.

# Keep the occupation row(s) whose prob_1 equals the smallest prob_1 in the
# table and tag them with a constant anomaly label.
min_data_group_prob <- mutate(
  filter(data_group_prob, prob_1 == min(prob_1)),
  Anomaly_1 = "Targeted Prob anomaly"
)
summary(min_data_group_prob)
##   occupation         avg_hrs_week     Maximum       prob_1       
##  Length:1           Min.   :31.6   Min.   :99   Min.   :0.05237  
##  Class :character   1st Qu.:31.6   1st Qu.:99   1st Qu.:0.05237  
##  Mode  :character   Median :31.6   Median :99   Median :0.05237  
##                     Mean   :31.6   Mean   :99   Mean   :0.05237  
##                     3rd Qu.:31.6   3rd Qu.:99   3rd Qu.:0.05237  
##                     Max.   :31.6   Max.   :99   Max.   :0.05237  
##   Anomaly_1        
##  Length:1          
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Keep the education row(s) whose prob_2 equals the smallest prob_2 in the
# table and tag them with a constant anomaly label.
min_data_group_prob_2 <- mutate(
  filter(data_group_2_prob, prob_2 == min(prob_2)),
  Anomaly_2 = "Targeted Prob anomaly"
)
summary(min_data_group_prob_2)
##   education          avg_hrs_week    Maximum       prob_2       
##  Length:1           Min.   :34    Min.   :99   Min.   :0.05315  
##  Class :character   1st Qu.:34    1st Qu.:99   1st Qu.:0.05315  
##  Mode  :character   Median :34    Median :99   Median :0.05315  
##                     Mean   :34    Mean   :99   Mean   :0.05315  
##                     3rd Qu.:34    3rd Qu.:99   3rd Qu.:0.05315  
##                     Max.   :34    Max.   :99   Max.   :0.05315  
##   Anomaly_2        
##  Length:1          
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Keep the race row(s) whose prob_3 equals the smallest prob_3 in the table
# and tag them with a constant anomaly label.
min_data_group_prob_3 <- mutate(
  filter(data_group_3_prob, prob_3 == min(prob_3)),
  Anomaly_3 = "Targeted Prob anomaly"
)
summary(min_data_group_prob_3)
##      race            capital_gain      Maximum          prob_3       
##  Length:1           Min.   :370.2   Min.   :27828   Min.   :0.07728  
##  Class :character   1st Qu.:370.2   1st Qu.:27828   1st Qu.:0.07728  
##  Mode  :character   Median :370.2   Median :27828   Median :0.07728  
##                     Mean   :370.2   Mean   :27828   Mean   :0.07728  
##                     3rd Qu.:370.2   3rd Qu.:27828   3rd Qu.:0.07728  
##                     Max.   :370.2   Max.   :27828   Max.   :0.07728  
##   Anomaly_3        
##  Length:1          
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Anomaly and combined data set result:

# Attach the three minimum-share tables to every record by their grouping key
# (occupation, education, race). Records whose key is not in a table get NA in
# that table's columns; the NA anomaly labels are then replaced with
# "Not Anomaly". The other joined columns keep their NAs.
data <- adult_income_data %>%
  left_join(min_data_group_prob, by = "occupation") %>%
  left_join(min_data_group_prob_2, by = "education") %>%
  left_join(min_data_group_prob_3, by = "race") %>%
  mutate(
    Anomaly_1 = replace(Anomaly_1, is.na(Anomaly_1), "Not Anomaly"),
    Anomaly_2 = replace(Anomaly_2, is.na(Anomaly_2), "Not Anomaly"),
    Anomaly_3 = replace(Anomaly_3, is.na(Anomaly_3), "Not Anomaly")
  )
summary(data)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:16281       Min.   :  13492   Length:16281      
##  1st Qu.:28.00   Class :character   1st Qu.: 116736   Class :character  
##  Median :37.00   Mode  :character   Median : 177831   Mode  :character  
##  Mean   :38.77                      Mean   : 189436                     
##  3rd Qu.:48.00                      3rd Qu.: 238384                     
##  Max.   :90.00                      Max.   :1490400                     
##                                                                         
##      edunum      maritalstatus       occupation        relationship      
##  Min.   : 1.00   Length:16281       Length:16281       Length:16281      
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :10.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.07                                                           
##  3rd Qu.:12.00                                                           
##  Max.   :16.00                                                           
##                                                                          
##      race               sex             capitalgain     capitalloss    
##  Length:16281       Length:16281       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1082   Mean   :  87.9  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :3770.0  
##                                                                        
##   hoursperweek   nativecountry         income          avg_hrs_week.x 
##  Min.   : 1.00   Length:16281       Length:16281       Min.   :31.6   
##  1st Qu.:40.00   Class :character   Class :character   1st Qu.:31.6   
##  Median :40.00   Mode  :character   Mode  :character   Median :31.6   
##  Mean   :40.39                                         Mean   :31.6   
##  3rd Qu.:45.00                                         3rd Qu.:31.6   
##  Max.   :99.00                                         Max.   :31.6   
##                                                        NA's   :15315  
##    Maximum.x         prob_1       Anomaly_1         avg_hrs_week.y 
##  Min.   :99      Min.   :0.052   Length:16281       Min.   :34     
##  1st Qu.:99      1st Qu.:0.052   Class :character   1st Qu.:34     
##  Median :99      Median :0.052   Mode  :character   Median :34     
##  Mean   :99      Mean   :0.052                      Mean   :34     
##  3rd Qu.:99      3rd Qu.:0.052                      3rd Qu.:34     
##  Max.   :99      Max.   :0.052                      Max.   :34     
##  NA's   :15315   NA's   :15315                      NA's   :15644  
##    Maximum.y         prob_2       Anomaly_2          capital_gain  
##  Min.   :99      Min.   :0.053   Length:16281       Min.   :370.2  
##  1st Qu.:99      1st Qu.:0.053   Class :character   1st Qu.:370.2  
##  Median :99      Median :0.053   Mode  :character   Median :370.2  
##  Mean   :99      Mean   :0.053                      Mean   :370.2  
##  3rd Qu.:99      3rd Qu.:0.053                      3rd Qu.:370.2  
##  Max.   :99      Max.   :0.053                      Max.   :370.2  
##  NA's   :15644   NA's   :15644                      NA's   :16122  
##     Maximum          prob_3       Anomaly_3        
##  Min.   :27828   Min.   :0.077   Length:16281      
##  1st Qu.:27828   1st Qu.:0.077   Class :character  
##  Median :27828   Median :0.077   Mode  :character  
##  Mean   :27828   Mean   :0.077                     
##  3rd Qu.:27828   3rd Qu.:0.077                     
##  Max.   :27828   Max.   :0.077                     
##  NA's   :16122   NA's   :16122

Part - 2:

# Generate all combinations of categorical variables
data <- adult_income_data
# Full cartesian grid of the distinct occupation, education and race values.
# NOTE(review): no later statement visible in this document reads
# `combinations`; it is kept so that anything relying on it still works.
combinations <- expand.grid(
  cat1 = unique(data$occupation),
  cat2 = unique(data$education),
  cat3 = unique(data$race)
)
# Number of records for each (occupation, education, race) combination that
# actually occurs in the data. `.groups = "drop"` returns an ungrouped table,
# so operations applied to it afterwards (for example min(Count)) work on the
# whole table instead of within each occupation/education group, and the
# "grouped output" message is no longer emitted.
combination_counts <- data %>%
  group_by(occupation, education, race) %>%
  summarize(Count = n(), .groups = "drop")
## `summarise()` has grouped output by 'occupation', 'education'. You can override
## using the `.groups` argument.
# View() opens a viewer window; only call it in an interactive session so
# that rendering the document non-interactively does not depend on a display.
if (interactive()) {
  View(combination_counts)
}
# Keep only the combinations whose count equals the smallest count in the
# whole table. ungroup() matters: on a table still grouped by occupation and
# education, min(Count) is taken within each group, which keeps one "minimum"
# per group (including counts greater than the overall minimum) instead of
# the globally rarest combinations.
least_common_combo <- combination_counts %>%
  ungroup() %>%
  filter(Count == min(Count))
cat("\nleast Common Combinations:\n")
## 
## least Common Combinations:
# Show the filtered combinations (tibble printing truncates to the first rows).
print(least_common_combo)
## # A tibble: 258 × 4
## # Groups:   occupation, education [209]
##    occupation education     race                  Count
##    <chr>      <chr>         <chr>                 <int>
##  1 " ?"       " 10th"       " Amer-Indian-Eskimo"     1
##  2 " ?"       " 11th"       " Amer-Indian-Eskimo"     1
##  3 " ?"       " 11th"       " Asian-Pac-Islander"     1
##  4 " ?"       " 12th"       " Black"                  4
##  5 " ?"       " 1st-4th"    " Black"                  2
##  6 " ?"       " 5th-6th"    " Black"                  2
##  7 " ?"       " 7th-8th"    " Black"                  2
##  8 " ?"       " 9th"        " Black"                  3
##  9 " ?"       " Assoc-acdm" " Black"                  2
## 10 " ?"       " Assoc-voc"  " Asian-Pac-Islander"     1
## # ℹ 248 more rows

Visualizations of these combinations:

library(ggplot2)
# Bar chart of combination counts per occupation, one panel per education
# level. Columns are referenced by bare name inside aes() and facet_wrap() so
# they are looked up in the plot data rather than pulled from an external
# vector with `$`. education is mapped to fill so that the fill label, the
# discrete fill scale and the top legend position below take effect.
# NOTE(review): combination_counts has one row per occupation/education/race,
# so rows that differ only by race share an x position within a panel and may
# overlap -- confirm whether the counts should be summed per occupation first.
ggplot(
  combination_counts,
  aes(x = occupation, y = Count, fill = education)
) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  facet_wrap(~education) +
  labs(
    x = "occupation",
    y = "Count",
    fill = "education",
    title = "Grouped Barplot for occupation, count and education"
  ) +
  theme_minimal() +
  scale_fill_discrete() +
  theme(legend.position = "top") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Heatmap of record counts by occupation (x) and education (y).
# combination_counts has one row per occupation/education/race, so several
# rows would land on the same tile and be drawn on top of each other. Sum the
# counts over race first so each tile shows the total for its
# occupation/education pair. Columns are referenced by bare name inside aes()
# so they are looked up in the plot data.
combination_counts %>%
  group_by(occupation, education) %>%
  summarise(Count = sum(Count), .groups = "drop") %>%
  ggplot(aes(x = occupation, y = education, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "green") +
  labs(x = "occupation", y = "education", fill = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heatmap of record counts by race (x) and education (y).
# combination_counts has one row per occupation/education/race, so several
# rows would land on the same tile and be drawn on top of each other. Sum the
# counts over occupation first so each tile shows the total for its
# race/education pair. Columns are referenced by bare name inside aes() so
# they are looked up in the plot data.
combination_counts %>%
  group_by(race, education) %>%
  summarise(Count = sum(Count), .groups = "drop") %>%
  ggplot(aes(x = race, y = education, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "orange", high = "white") +
  labs(x = "race", y = "education", fill = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Conclusion

From the above graphs we can conclude a few points: there are more white people who completed HS-grad education than people of other races, and people who completed HS-grad study are most often in the craft-repair occupation. Also, most of the people who hold a job come from the HS-grad, pre-school and some-college education levels, i.e. very few people completed a higher degree such as a PhD. Some fields in the dataset contain “?” as a value. This value might not affect the categorical attribute itself, but it definitely affects the numerical variables: it can mislead the analysis because the affected records are not summarized and aggregated correctly. Hence, plotting against those values makes no sense.