This is a final project to show off what you have learned. Select your data set from the list below: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list). Another good source is found here:https://https://archive.ics.uci.edu/ml/datasets.html. The present ation approach is up to you but it should contain the following:

Problem we are trying to find:

This dataset has more than 32000 records containing individual information like age, income, country etc. We will try to find out the composition by region (country), the demographics, the relationship between age, education and income.

1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the dataset. Please include some conclusions in the R Markdown text.

#Read remote data
datafrm = read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", sep=",",col.names=c("age", "type_employer", "fnlwgt", "education", "education_num","marital", "occupation", "relationship", "race","sex", "capital_gain", "capital_loss", "hr_per_week","country", "income"), fill=FALSE, strip.white=T)

#To convert to dataframe
#datafrm <- as.data.frame(datafrm)
#typeof(datafrm)

#View a sample of the data
head(datafrm)
##   age    type_employer fnlwgt education education_num            marital
## 1  39        State-gov  77516 Bachelors            13      Never-married
## 2  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3  38          Private 215646   HS-grad             9           Divorced
## 4  53          Private 234721      11th             7 Married-civ-spouse
## 5  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital_gain capital_loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hr_per_week       country income
## 1          40 United-States  <=50K
## 2          13 United-States  <=50K
## 3          40 United-States  <=50K
## 4          40 United-States  <=50K
## 5          40          Cuba  <=50K
## 6          40 United-States  <=50K
#Check the summary
summary(datafrm)
##       age                 type_employer       fnlwgt       
##  Min.   :17.00   Private         :22696   Min.   :  12285  
##  1st Qu.:28.00   Self-emp-not-inc: 2541   1st Qu.: 117827  
##  Median :37.00   Local-gov       : 2093   Median : 178356  
##  Mean   :38.58   ?               : 1836   Mean   : 189778  
##  3rd Qu.:48.00   State-gov       : 1298   3rd Qu.: 237051  
##  Max.   :90.00   Self-emp-inc    : 1116   Max.   :1484705  
##                  (Other)         :  981                    
##         education     education_num                    marital     
##  HS-grad     :10501   Min.   : 1.00   Divorced             : 4443  
##  Some-college: 7291   1st Qu.: 9.00   Married-AF-spouse    :   23  
##  Bachelors   : 5355   Median :10.00   Married-civ-spouse   :14976  
##  Masters     : 1723   Mean   :10.08   Married-spouse-absent:  418  
##  Assoc-voc   : 1382   3rd Qu.:12.00   Never-married        :10683  
##  11th        : 1175   Max.   :16.00   Separated            : 1025  
##  (Other)     : 5134                   Widowed              :  993  
##            occupation           relationship                   race      
##  Prof-specialty :4140   Husband       :13193   Amer-Indian-Eskimo:  311  
##  Craft-repair   :4099   Not-in-family : 8305   Asian-Pac-Islander: 1039  
##  Exec-managerial:4066   Other-relative:  981   Black             : 3124  
##  Adm-clerical   :3770   Own-child     : 5068   Other             :  271  
##  Sales          :3650   Unmarried     : 3446   White             :27816  
##  Other-service  :3295   Wife          : 1568                             
##  (Other)        :9541                                                    
##      sex         capital_gain    capital_loss     hr_per_week   
##  Female:10771   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :21790   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1078   Mean   :  87.3   Mean   :40.44  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :4356.0   Max.   :99.00  
##                                                                 
##           country        income     
##  United-States:29170   <=50K:24720  
##  Mexico       :  643   >50K : 7841  
##  ?            :  583                
##  Philippines  :  198                
##  Germany      :  137                
##  Canada       :  121                
##  (Other)      : 1709
#Explore data (sample done for few columns)
min(datafrm$age)
## [1] 17
max(datafrm$age)
## [1] 90
mean(datafrm$age)
## [1] 38.58165
median(datafrm$age)
## [1] 37
range(datafrm$age)
## [1] 17 90
quantile(datafrm$age)
##   0%  25%  50%  75% 100% 
##   17   28   37   48   90
#type of column age
typeof(datafrm$age)
## [1] "integer"
table(datafrm$type_employer)
## 
##                ?      Federal-gov        Local-gov     Never-worked 
##             1836              960             2093                7 
##          Private     Self-emp-inc Self-emp-not-inc        State-gov 
##            22696             1116             2541             1298 
##      Without-pay 
##               14
typeof(datafrm$type_employer)
## [1] "integer"
str(datafrm)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 9 levels "?","Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation   : Factor w/ 15 levels "?","Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
##  $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race         : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
colnames(datafrm)
##  [1] "age"           "type_employer" "fnlwgt"        "education"    
##  [5] "education_num" "marital"       "occupation"    "relationship" 
##  [9] "race"          "sex"           "capital_gain"  "capital_loss" 
## [13] "hr_per_week"   "country"       "income"

As you see the employer type can be grouped together. It is also of type integer which has to change. We can do it in data wrangling stage.

2. Data wrangling: Please perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data (for example - if it makes sense you could sum two columns together)

Drop columns that we don’t need. Also we can rename the column type_employer to employer_type because it is easy to read

#Remove fields that we don't need
datafrm[["education_num"]]=NULL
datafrm[["fnlwgt"]]=NULL
datafrm[["capital_gain"]]=NULL
datafrm[["capital_loss"]]=NULL

#Check if the columns are dropped
head(datafrm)
##   age    type_employer education            marital        occupation
## 1  39        State-gov Bachelors      Never-married      Adm-clerical
## 2  50 Self-emp-not-inc Bachelors Married-civ-spouse   Exec-managerial
## 3  38          Private   HS-grad           Divorced Handlers-cleaners
## 4  53          Private      11th Married-civ-spouse Handlers-cleaners
## 5  28          Private Bachelors Married-civ-spouse    Prof-specialty
## 6  37          Private   Masters Married-civ-spouse   Exec-managerial
##    relationship  race    sex hr_per_week       country income
## 1 Not-in-family White   Male          40 United-States  <=50K
## 2       Husband White   Male          13 United-States  <=50K
## 3 Not-in-family White   Male          40 United-States  <=50K
## 4       Husband Black   Male          40 United-States  <=50K
## 5          Wife Black Female          40          Cuba  <=50K
## 6          Wife White Female          40 United-States  <=50K
colnames(datafrm)[2] <- "empoyer_type"

datafrm$empoyer_type = as.character(datafrm$empoyer_type)

#Check to make sure the column type changed
typeof(datafrm$empoyer_type)
## [1] "character"
#Combine employer type
datafrm$empoyer_type = gsub("^Federal-gov","Govt",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Local-gov","Govt",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^State-gov","Govt",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Private","Private",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Self-emp-inc","Self-Employed",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Self-emp-not-inc","Self-Employed",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Without-pay","Not-Working",datafrm$empoyer_type)
datafrm$empoyer_type = gsub("^Never-worked","Not-Working",datafrm$empoyer_type)

table(datafrm$empoyer_type)
## 
##             ?          Govt   Not-Working       Private Self-Employed 
##          1836          4351            21         22696          3657
table(datafrm$occupation)
## 
##                 ?      Adm-clerical      Armed-Forces      Craft-repair 
##              1843              3770                 9              4099 
##   Exec-managerial   Farming-fishing Handlers-cleaners Machine-op-inspct 
##              4066               994              1370              2002 
##     Other-service   Priv-house-serv    Prof-specialty   Protective-serv 
##              3295               149              4140               649 
##             Sales      Tech-support  Transport-moving 
##              3650               928              1597
datafrm$occupation = as.character(datafrm$occupation)

#Check to make sure the column type changed
typeof(datafrm$occupation)
## [1] "character"
datafrm$occupation = gsub("^Adm-clerical","Admin",datafrm$occupation)
datafrm$occupation = gsub("^Armed-Forces","Military",datafrm$occupation)
datafrm$occupation = gsub("^Craft-repair","Blue-Collar",datafrm$occupation)
datafrm$occupation = gsub("^Exec-managerial","White-Collar",datafrm$occupation)
datafrm$occupation = gsub("^Farming-fishing","Blue-Collar",datafrm$occupation)
datafrm$occupation = gsub("^Handlers-cleaners","Blue-Collar",datafrm$occupation)
datafrm$occupation = gsub("^Machine-op-inspct","Blue-Collar",datafrm$occupation)
datafrm$occupation = gsub("^Other-service","Service",datafrm$occupation)
datafrm$occupation = gsub("^Priv-house-serv","Service",datafrm$occupation)
datafrm$occupation = gsub("^Prof-specialty","Professional",datafrm$occupation)
datafrm$occupation = gsub("^Protective-serv","Other",datafrm$occupation)
datafrm$occupation = gsub("^Sales","Sales",datafrm$occupation)
datafrm$occupation = gsub("^Tech-support","Other",datafrm$occupation)
datafrm$occupation = gsub("^Transport-moving","Blue-Collar",datafrm$occupation)

Group countries by region/continent

table(datafrm$country)
## 
##                          ?                   Cambodia 
##                        583                         19 
##                     Canada                      China 
##                        121                         75 
##                   Columbia                       Cuba 
##                         59                         95 
##         Dominican-Republic                    Ecuador 
##                         70                         28 
##                El-Salvador                    England 
##                        106                         90 
##                     France                    Germany 
##                         29                        137 
##                     Greece                  Guatemala 
##                         29                         64 
##                      Haiti         Holand-Netherlands 
##                         44                          1 
##                   Honduras                       Hong 
##                         13                         20 
##                    Hungary                      India 
##                         13                        100 
##                       Iran                    Ireland 
##                         43                         24 
##                      Italy                    Jamaica 
##                         73                         81 
##                      Japan                       Laos 
##                         62                         18 
##                     Mexico                  Nicaragua 
##                        643                         34 
## Outlying-US(Guam-USVI-etc)                       Peru 
##                         14                         31 
##                Philippines                     Poland 
##                        198                         60 
##                   Portugal                Puerto-Rico 
##                         37                        114 
##                   Scotland                      South 
##                         12                         80 
##                     Taiwan                   Thailand 
##                         51                         18 
##            Trinadad&Tobago              United-States 
##                         19                      29170 
##                    Vietnam                 Yugoslavia 
##                         67                         16
datafrm$country = as.character(datafrm$country)

#Check to make sure the column type changed
typeof(datafrm$country)
## [1] "character"
datafrm$country[datafrm$country=="Cambodia"] = "Asia"
datafrm$country[datafrm$country=="Canada"] = "North-America"    
datafrm$country[datafrm$country=="China"] = "Asia"       
datafrm$country[datafrm$country=="Columbia"] = "South-America"    
datafrm$country[datafrm$country=="Cuba"] = "North-America"        
datafrm$country[datafrm$country=="Dominican-Republic"] = "North-America"
datafrm$country[datafrm$country=="Ecuador"] = "South-America"     
datafrm$country[datafrm$country=="El-Salvador"] = "South-America" 
datafrm$country[datafrm$country=="England"] = "Europe"
datafrm$country[datafrm$country=="France"] = "Europe"
datafrm$country[datafrm$country=="Germany"] = "Europe"
datafrm$country[datafrm$country=="Greece"] = "Europe"
datafrm$country[datafrm$country=="Guatemala"] = "North-America"
datafrm$country[datafrm$country=="Haiti"] = "North-America"
datafrm$country[datafrm$country=="Holand-Netherlands"] = "Europe"
datafrm$country[datafrm$country=="Honduras"] = "North-America"
datafrm$country[datafrm$country=="Hong"] = "Asia"
datafrm$country[datafrm$country=="Hungary"] = "Europe"
datafrm$country[datafrm$country=="India"] = "Asia"
datafrm$country[datafrm$country=="Iran"] = "Asia"
datafrm$country[datafrm$country=="Ireland"] = "Australia"
datafrm$country[datafrm$country=="Italy"] = "Europe"
datafrm$country[datafrm$country=="Jamaica"] = "North-America"
datafrm$country[datafrm$country=="Japan"] = "Asia"
datafrm$country[datafrm$country=="Laos"] = "Asia"
datafrm$country[datafrm$country=="Mexico"] = "North-America"
datafrm$country[datafrm$country=="Nicaragua"] = "North-America"
datafrm$country[datafrm$country=="Outlying-US(Guam-USVI-etc)"] = "North-America"
datafrm$country[datafrm$country=="Peru"] = "South-America"
datafrm$country[datafrm$country=="Philippines"] = "Asia"
datafrm$country[datafrm$country=="Poland"] = "Europe"
datafrm$country[datafrm$country=="Portugal"] = "Europe"
datafrm$country[datafrm$country=="Puerto-Rico"] = "North-America"
datafrm$country[datafrm$country=="Scotland"] = "Europe"
datafrm$country[datafrm$country=="South"] = "Europe"
datafrm$country[datafrm$country=="Taiwan"] = "Asia"
datafrm$country[datafrm$country=="Thailand"] = "Asia"
datafrm$country[datafrm$country=="Trinadad&Tobago"] = "North-America"
datafrm$country[datafrm$country=="United-States"] = "North-America"
datafrm$country[datafrm$country=="Vietnam"] = "Asia"
datafrm$country[datafrm$country=="Yugoslavia"] = "Europe"

table(datafrm$country)
## 
##             ?          Asia     Australia        Europe North-America 
##           583           671            24           577         30482 
## South-America 
##           224
set.seed(1)
agesg <- floor(runif(datafrm$age, min = 20, max = 50))
datafrm$agegroup <- findInterval(agesg, c(20, 30, 40))

head(datafrm)
##   age  empoyer_type education            marital   occupation
## 1  39          Govt Bachelors      Never-married        Admin
## 2  50 Self-Employed Bachelors Married-civ-spouse White-Collar
## 3  38       Private   HS-grad           Divorced  Blue-Collar
## 4  53       Private      11th Married-civ-spouse  Blue-Collar
## 5  28       Private Bachelors Married-civ-spouse Professional
## 6  37       Private   Masters Married-civ-spouse White-Collar
##    relationship  race    sex hr_per_week       country income agegroup
## 1 Not-in-family White   Male          40 North-America  <=50K        1
## 2       Husband White   Male          13 North-America  <=50K        2
## 3 Not-in-family White   Male          40 North-America  <=50K        2
## 4       Husband Black   Male          40 North-America  <=50K        3
## 5          Wife Black Female          40 North-America  <=50K        1
## 6          Wife White Female          40 North-America  <=50K        3

3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.

Traditional Charts

plot(datafrm$age)

plot(datafrm$age, axes=F, ylim=c(0,100), typ='l', ann=F)

hist(datafrm$age)

ggplot2 charts

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
datafrm$dose <- as.factor(datafrm$age)
head(datafrm)
##   age  empoyer_type education            marital   occupation
## 1  39          Govt Bachelors      Never-married        Admin
## 2  50 Self-Employed Bachelors Married-civ-spouse White-Collar
## 3  38       Private   HS-grad           Divorced  Blue-Collar
## 4  53       Private      11th Married-civ-spouse  Blue-Collar
## 5  28       Private Bachelors Married-civ-spouse Professional
## 6  37       Private   Masters Married-civ-spouse White-Collar
##    relationship  race    sex hr_per_week       country income agegroup
## 1 Not-in-family White   Male          40 North-America  <=50K        1
## 2       Husband White   Male          13 North-America  <=50K        2
## 3 Not-in-family White   Male          40 North-America  <=50K        2
## 4       Husband Black   Male          40 North-America  <=50K        3
## 5          Wife Black Female          40 North-America  <=50K        1
## 6          Wife White Female          40 North-America  <=50K        3
##   dose
## 1   39
## 2   50
## 3   38
## 4   53
## 5   28
## 6   37
ggplot(data=datafrm, aes(x=age, y = income), color=drv) + geom_point() + geom_quantile() + ggtitle("Quantile plot - Age vs Income")
## Warning: Computation failed in `stat_quantile()`:
## Package `quantreg` required for `stat_quantile`.
## Please install and try again.

ggplot(data=datafrm, aes(x=log(datafrm$age))) + geom_bar() + scale_fill_gradient(low="red",high = "yellow") + ggtitle("Bar Chart - log(Age)")

ggplot(data=datafrm, aes(x=country), color=country) + geom_bar() + ggtitle("Bar Chart - Region/Country")

ggplot(data=datafrm, aes(x=education), color=education) + geom_bar() + ggtitle("Education")

ggplot(data=datafrm, aes(x=sex), color=sex) + geom_bar(aes(weight = age)) + scale_fill_gradient(low = "red",high = "yellow") + ggtitle("Bar Chart - Male vs Female")

ggplot(datafrm, aes(x = age)) + geom_histogram(color='white') + ggtitle("Histogram - Age")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(datafrm, aes(x = age, y=country)) + geom_point(color='darkblue') + ggtitle("Age composition by Region")

#Scatter plot - age vs income
ggplot(datafrm, aes(x=age, y=hr_per_week )) + geom_point(size=1, shape=23) + ggtitle("Scatter plot - Age vs Hours/Week")

#boxplot
ggplot(datafrm, aes(x=country, y=age)) + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=2, notch=FALSE) + ggtitle("Box Plot - Age vs Country")

ggplot(datafrm, aes(x=age, y=country)) + stat_summary(fun.y=mean, geom="point", shape=23, size=2) + ggtitle("Age vs Country")

ggplot(datafrm, aes(x=occupation, y=agegroup)) + geom_boxplot() + ggtitle("Box Plot - Age (Group) vs Occupation")

4. Meaningful question for analysis: Please state at the beginning a meaningful question for analysis. Use the first three steps and anything else that would be helpful to answer the question you are posing from the data set you chose. Please write a brief conclusion paragraph in R markdown at the end.

Conclusion/Analysis

1. From the above chart we see that there are more people who start early and end late in life who earn <=50k.

2. The dataset contains more young people than elderly.

3. The barchart by region shows that the dataset contains more people from North-America than other regions.So any conclusions derived might be mostly applicable to that region.

4. There is about 2:1 ratio of Male to Female in the dataset.

5. The ‘Age composition by Region’ chart shows younger workforce i regions other than North-America. We have to determine if it is because the data was collected most for Nort-America.

6. The Age vs Hours/Wekk scatter plot indicated that most of the people of age till 60 work mostly between 40 to 50 hours a week. It also shows the mid to younger ages works more than the normal 40 hours a week, an indication of overtime. This could be an indicator of high cost of living. This hypothesis would need more investigation.

Solution:

Create a git project. Go to the project directory

Commands:

echo “# CUNY_R” >> README.md

git init

git add README.md

git add adult.csv

git status

git commit -m “r data for bridge HW3”

git remote add origin https://github.com/monuchacko/CUNY_R.git

git push -u origin master

require(RCurl)
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 3.4.4
## Loading required package: bitops
## Warning: package 'bitops' was built under R version 3.4.4
#Read data from git (Get the raw url to avoid auth errors)
datafrmgit = read.csv("https://raw.githubusercontent.com/monuchacko/CUNY_R/master/adult.csv", sep=",",col.names=c("age", "type_employer", "fnlwgt", "education", "education_num","marital", "occupation", "relationship", "race","sex", "capital_gain", "capital_loss", "hr_per_week","country", "income"), fill=FALSE, strip.white=T)

#Check the data
head(datafrmgit)
##   age    type_employer fnlwgt education education_num
## 1  50 Self-emp-not-inc  83311 Bachelors            13
## 2  38          Private 215646   HS-grad             9
## 3  53          Private 234721      11th             7
## 4  28          Private 338409 Bachelors            13
## 5  37          Private 284582   Masters            14
## 6  49          Private 160187       9th             5
##                 marital        occupation  relationship  race    sex
## 1    Married-civ-spouse   Exec-managerial       Husband White   Male
## 2              Divorced Handlers-cleaners Not-in-family White   Male
## 3    Married-civ-spouse Handlers-cleaners       Husband Black   Male
## 4    Married-civ-spouse    Prof-specialty          Wife Black Female
## 5    Married-civ-spouse   Exec-managerial          Wife White Female
## 6 Married-spouse-absent     Other-service Not-in-family Black Female
##   capital_gain capital_loss hr_per_week       country income
## 1            0            0          13 United-States  <=50K
## 2            0            0          40 United-States  <=50K
## 3            0            0          40 United-States  <=50K
## 4            0            0          40          Cuba  <=50K
## 5            0            0          40 United-States  <=50K
## 6            0            0          16       Jamaica  <=50K