Lab 5

Author

Jason Flores

library(datasetsICR)
library(ggplot2)
# Load the German credit data set from datasetsICR and preview the first rows.
data(german)
head(german, n = 6L)
  Age Gender Housing Saving accounts Checking account Credit amount Duration
1  67   male     own            <NA>           little          1169        6
2  22 female     own          little         moderate          5951       48
3  49   male     own          little             <NA>          2096       12
4  45   male    free          little           little          7882       42
5  53   male    free          little           little          4870       24
6  35   male    free            <NA>             <NA>          9055       36
              Purpose Class Risk
1            radio/TV          1
2            radio/TV          2
3           education          1
4 furniture/equipment          1
5                 car          2
6           education          1

Part One

library(socviz)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Count observations for every Gender x Housing combination, then express each
# housing category as a share (freq) and rounded percentage (pct) of its
# gender group.
pip1 <- german %>%
  # Use bare column names: is.na("Gender") tests the literal string, which is
  # never NA, so the quoted form never dropped any rows.
  filter(!is.na(Gender) & !is.na(Housing)) %>%
  group_by(Gender, Housing) %>%
  summarize(N = n()) %>%
  # The result is still grouped by Gender here, so sum(N) is the per-gender
  # total and the percentages add up to (about) 100 within each gender.
  mutate(freq = N / sum(N),
         pct = round(freq * 100, 0))
`summarise()` has grouped output by 'Gender'. You can override using the
`.groups` argument.
# Display the Gender-by-Housing summary table built above.
pip1
# A tibble: 6 × 5
# Groups:   Gender [2]
  Gender Housing     N   freq   pct
  <chr>  <chr>   <int>  <dbl> <dbl>
1 female free       19 0.0613     6
2 female own       196 0.632     63
3 female rent       95 0.306     31
4 male   free       89 0.129     13
5 male   own       517 0.749     75
6 male   rent       84 0.122     12
# Inspect the first 20 observations as a sanity check on the raw data.
head(german, n = 20L)
   Age Gender Housing Saving accounts Checking account Credit amount Duration
1   67   male     own            <NA>           little          1169        6
2   22 female     own          little         moderate          5951       48
3   49   male     own          little             <NA>          2096       12
4   45   male    free          little           little          7882       42
5   53   male    free          little           little          4870       24
6   35   male    free            <NA>             <NA>          9055       36
7   53   male     own      quite rich             <NA>          2835       24
8   35   male    rent          little         moderate          6948       36
9   61   male     own            rich             <NA>          3059       12
10  28   male     own          little         moderate          5234       30
11  25 female    rent          little         moderate          1295       12
12  24 female    rent          little           little          4308       48
13  22 female     own          little         moderate          1567       12
14  60   male     own          little           little          1199       24
15  28 female    rent          little           little          1403       15
16  32 female     own        moderate           little          1282       24
17  53   male     own            <NA>             <NA>          2424       24
18  25   male     own            <NA>           little          8072       30
19  44 female    free          little         moderate         12579       24
20  31   male     own      quite rich             <NA>          3430       24
               Purpose Class Risk
1             radio/TV          1
2             radio/TV          2
3            education          1
4  furniture/equipment          1
5                  car          2
6            education          1
7  furniture/equipment          1
8                  car          1
9             radio/TV          1
10                 car          2
11                 car          2
12            business          2
13            radio/TV          1
14                 car          2
15                 car          1
16            radio/TV          2
17            radio/TV          1
18            business          1
19                 car          2
20            radio/TV          1

This data shows the housing status (own, rent, or free) of male and female borrowers. It also records the status of their saving and checking accounts and the purpose of the credit. Looking at the first 20 observations, the data does seem to make sense, and the numbers match what we would expect to see in the real world.

Part 2

# Stacked bar chart: one bar per gender, segmented by housing type.
# The base plot `p` is reused by the dodged and faceted charts that follow.
p <- ggplot(
  data = subset(pip1, !is.na(Gender) & !is.na(Housing)),
  mapping = aes(x = Gender, y = pct, fill = Housing)
)

p +
  geom_col(position = "stack") +
  # Centre each percentage label inside its own segment.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    title = "Housing By Gender",
    subtitle = "As a Stacked Bar Chart",
    caption = "german dataset",
    x = "Gender",
    y = "Percent",
    fill = "Housing"
  )

# Dodged bar chart: housing types drawn side by side within each gender.
p +
  geom_col(position = "dodge2") +
  # Dodge the labels as well so each one sits over its own bar.
  geom_text(aes(label = pct), position = position_dodge(width = 0.9)) +
  labs(
    title = "Housing By Gender",
    subtitle = "As a dodged bar chart",
    caption = "german dataset",
    x = "Gender",
    y = "Percent",
    fill = "Housing"
  )

# Faceted horizontal bar chart: one panel per housing type, one bar per gender.
p + geom_col(position = "dodge2") +
    # Label each scale after the variable mapped to it: x is Gender (shown on
    # the vertical axis after coord_flip()) and fill is Housing. The original
    # labels had the two names swapped.
    labs(x = "Gender", y = "Percent", fill = "Housing",
         title = "Housing By Gender", caption = "german dataset",
         subtitle = "As a faceted horizontal bar chart") +
    # The facet strips already name the housing type, so hide the fill legend.
    guides(fill = "none") +
    coord_flip() +
    facet_grid(~ Housing) +
    geom_text(aes(label = pct), position = position_dodge2(width = 1))

Part Three

# Per-housing summary: number of observations, mean age, mean credit amount,
# and each housing type's share of all observations (freq / pct).
pip2 <- german %>%
  group_by(Housing) %>%
  summarise(
    N = n(),
    age_mean = mean(Age, na.rm = TRUE),
    credit_mean = mean(`Credit amount`, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100)
  )
pip2
# A tibble: 3 × 6
  Housing     N age_mean credit_mean  freq   pct
  <chr>   <int>    <dbl>       <dbl> <dbl> <dbl>
1 free      108     43.8       4906. 0.108    11
2 own       713     35.6       3061. 0.713    71
3 rent      179     30.4       3123. 0.179    18

After summarizing one categorical variable and finding the means of two numerical variables, we can dive deeper into what the numbers mean for the different categories of housing. The mean age is a lot higher for those with free housing than for those who own or rent. People with free housing also have a much higher mean credit amount, but they have the fewest observations, which can affect the mean.

Part Four

# Scatter plot of the per-housing means: average credit amount (x) against
# average age (y), one point per housing category.
p <- ggplot(pip2, aes(x = credit_mean, y = age_mean, color = Housing))
p + geom_point(size = 6) +
    # Text annotation placed near the "free" point.
    annotate(geom = "text", x = 4100, y = 45,
             label = "Average Age for Free is 44", hjust = 0) +
    labs(y = "Average Age", x = "Average Credit Amount",
         title = "Age and Credit Amount by Housing",
         subtitle = "Free Housing Has The Most Credit Amount",
         # `german` is loaded from datasetsICR, not socviz; this now matches
         # the captions used in Parts Five and Six.
         caption = "german dataset{datasetsICR}")

Part Five

  # Same scatter plot as Part Four, with a shaded rectangle around the "free"
  # point and a restyled legend title.
  p <- ggplot(
    data = pip2,
    mapping = aes(x = credit_mean, y = age_mean, color = Housing)
  )
  p +
    geom_point(size = 6) +
    annotate(
      geom = "text",
      label = "Average Age for Free is 44",
      x = 4100, y = 45, hjust = 0
    ) +
    # Translucent orange box drawn around the "free" housing point.
    annotate(
      geom = "rect",
      xmin = 4750, xmax = 5000,
      ymin = 42.5, ymax = 44.5,
      fill = "orange", alpha = 0.2
    ) +
    theme(
      legend.title = element_text(color = "gray50", size = 13, face = "bold")
    ) +
    labs(
      title = "Age and Credit Amount by Housing",
      subtitle = "Free Housing Has The Most Credit Amount",
      caption = "german dataset{datasetsICR}",
      x = "Average Credit Amount",
      y = "Average Age"
    )

Part Six

# Same scatter plot again, this time with the legend removed.
p <- ggplot(
  data = pip2,
  mapping = aes(x = credit_mean, y = age_mean, color = Housing)
)
p +
  geom_point(size = 6) +
  annotate(
    geom = "text",
    label = "Average Age for Free is 44",
    x = 4300, y = 45, hjust = 0
  ) +
  theme(legend.position = "none") +
  labs(
    title = "Age and Credit Amount by Housing",
    subtitle = "Free Housing Has The Most Credit Amount",
    caption = "german dataset{datasetsICR}",
    x = "Average Credit Amount",
    y = "Average Age"
  )

Part Seven - Interpretation

After making different charts and graphs for the variables, we can see the main differences between the types of housing. Free housing is the outlier in the set because its average age is higher and its average credit amount is higher as well. This makes sense because the older a person is, the more credit they are likely to have. However, people who rent have a slightly higher average credit amount than those who own, possibly because they are constantly paying a bill every month.