library(tidyverse)
library(readr)
library(dplyr)
library(psych)

Data has been taken from various sources, including classmates and various websites

Halloween

The data is first read in using the read_csv function and displayed using head(). This shows what type of data is being worked with.

candyhierarchy2017 <- read_csv("candyhierarchy2017.csv")

## Warning: Missing column names filled in: 'X114' [114]

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Internal ID` = col_integer(),
##   `Q12: MEDIA [Daily Dish]` = col_integer(),
##   `Q12: MEDIA [Science]` = col_integer(),
##   `Q12: MEDIA [ESPN]` = col_integer(),
##   `Q12: MEDIA [Yahoo]` = col_integer()
## )

## See spec(...) for full column specifications.

head(candyhierarchy2017)

## # A tibble: 6 x 120
##   `Internal ID` `Q1: GOING OUT?` `Q2: GENDER` `Q3: AGE` `Q4: COUNTRY`
##           <int> <chr>            <chr>        <chr>     <chr>        
## 1      90258773 <NA>             <NA>         <NA>      <NA>         
## 2      90272821 No               Male         44        USA          
## 3      90272829 <NA>             Male         49        USA          
## 4      90272840 No               Male         40        us           
## 5      90272841 No               Male         23        usa          
## 6      90272852 No               Male         <NA>      <NA>         
## # ... with 115 more variables: `Q5: STATE, PROVINCE, COUNTY, ETC` <chr>,
## #   `Q6: 100 Grand Bar` <chr>, `Q6 | Anonymous brown globs that come in
## #   black and orange wrappers\t(a.k.a. Mary Janes)` <chr>, `Q6 | Any
## #   full-sized candy bar` <chr>, `Q6 | Black Jacks` <chr>, `Q6 | Bonkers
## #   (the candy)` <chr>, `Q6 | Bonkers (the board game)` <chr>, `Q6 |
## #   Bottle Caps` <chr>, `Q6 | Box'o'Raisins` <chr>, `Q6 | Broken glow
## #   stick` <chr>, `Q6 | Butterfinger` <chr>, `Q6 | Cadbury Creme
## #   Eggs` <chr>, `Q6 | Candy Corn` <chr>, `Q6 | Candy that is clearly just
## #   the stuff given out for free at restaurants` <chr>, `Q6 |
## #   Caramellos` <chr>, `Q6 | Cash, or other forms of legal tender` <chr>,
## #   `Q6 | Chardonnay` <chr>, `Q6 | Chick-o-Sticks (we don\xd5t know what
## #   that is)` <chr>, `Q6 | Chiclets` <chr>, `Q6 | Coffee Crisp` <chr>, `Q6
## #   | Creepy Religious comics/Chick Tracts` <chr>, `Q6 | Dental
## #   paraphenalia` <chr>, `Q6 | Dots` <chr>, `Q6 | Dove Bars` <chr>, `Q6 |
## #   Fuzzy Peaches` <chr>, `Q6 | Generic Brand Acetaminophen` <chr>, `Q6 |
## #   Glow sticks` <chr>, `Q6 | Goo Goo Clusters` <chr>, `Q6 | Good N'
## #   Plenty` <chr>, `Q6 | Gum from baseball cards` <chr>, `Q6 | Gummy Bears
## #   straight up` <chr>, `Q6 | Hard Candy` <chr>, `Q6 | Healthy
## #   Fruit` <chr>, `Q6 | Heath Bar` <chr>, `Q6 | Hershey's Dark
## #   Chocolate` <chr>, `Q6 | Hershey\xd5s Milk Chocolate` <chr>, `Q6 |
## #   Hershey's Kisses` <chr>, `Q6 | Hugs (actual physical hugs)` <chr>, `Q6
## #   | Jolly Rancher (bad flavor)` <chr>, `Q6 | Jolly Ranchers (good
## #   flavor)` <chr>, `Q6 | JoyJoy (Mit Iodine!)` <chr>, `Q6 | Junior
## #   Mints` <chr>, `Q6 | Senior Mints` <chr>, `Q6 | Kale smoothie` <chr>,
## #   `Q6 | Kinder Happy Hippo` <chr>, `Q6 | Kit Kat` <chr>, `Q6 |
## #   LaffyTaffy` <chr>, `Q6 | LemonHeads` <chr>, `Q6 | Licorice (not
## #   black)` <chr>, `Q6 | Licorice (yes black)` <chr>, `Q6 | Lindt
## #   Truffle` <chr>, `Q6 | Lollipops` <chr>, `Q6 | Mars` <chr>, `Q6 |
## #   Maynards` <chr>, `Q6 | Mike and Ike` <chr>, `Q6 | Milk Duds` <chr>,
## #   `Q6 | Milky Way` <chr>, `Q6 | Regular M&Ms` <chr>, `Q6 | Peanut
## #   M&M\xd5s` <chr>, `Q6 | Blue M&M's` <chr>, `Q6 | Red M&M's` <chr>, `Q6
## #   | Green Party M&M's` <chr>, `Q6 | Independent M&M's` <chr>, `Q6 |
## #   Abstained from M&M'ing.` <chr>, `Q6 | Minibags of chips` <chr>, `Q6 |
## #   Mint Kisses` <chr>, `Q6 | Mint Juleps` <chr>, `Q6 | Mr.
## #   Goodbar` <chr>, `Q6 | Necco Wafers` <chr>, `Q6 | Nerds` <chr>, `Q6 |
## #   Nestle Crunch` <chr>, `Q6 | Now'n'Laters` <chr>, `Q6 | Peeps` <chr>,
## #   `Q6 | Pencils` <chr>, `Q6 | Pixy Stix` <chr>, `Q6 | Real Housewives of
## #   Orange County Season 9 Blue-Ray` <chr>, `Q6 | Reese\xd5s Peanut Butter
## #   Cups` <chr>, `Q6 | Reese's Pieces` <chr>, `Q6 | Reggie Jackson
## #   Bar` <chr>, `Q6 | Rolos` <chr>, `Q6 | Sandwich-sized bags filled with
## #   BooBerry Crunch` <chr>, `Q6 | Skittles` <chr>, `Q6 | Smarties
## #   (American)` <chr>, `Q6 | Smarties (Commonwealth)` <chr>, `Q6 |
## #   Snickers` <chr>, `Q6 | Sourpatch Kids (i.e. abominations of
## #   nature)` <chr>, `Q6 | Spotted Dick` <chr>, `Q6 | Starburst` <chr>, `Q6
## #   | Sweet Tarts` <chr>, `Q6 | Swedish Fish` <chr>, `Q6 | Sweetums (a
## #   friend to diabetes)` <chr>, `Q6 | Take 5` <chr>, `Q6 | Tic
## #   Tacs` <chr>, `Q6 | Those odd marshmallow circus peanut things` <chr>,
## #   `Q6 | Three Musketeers` <chr>, `Q6 | Tolberone something or
## #   other` <chr>, `Q6 | Trail Mix` <chr>, `Q6 | Twix` <chr>, `Q6 | Vials
## #   of pure high fructose corn syrup, for main-lining into your
## #   vein` <chr>, `Q6 | Vicodin` <chr>, ...

Tidy and clean data

The columns are then renammed using the rename() function. This makes it easier to understand what each column means, and gives them meaningful names.

candyhierarchy2017 <- rename(candyhierarchy2017,id = 'Internal ID', 'Going out' = 'Q1: GOING OUT?', Sex = 'Q2: GENDER', Age = "Q3: AGE", Country = "Q4: COUNTRY", State = "Q5: STATE, PROVINCE, COUNTY, ETC", '100 Grand Bar' = 'Q6: 100 Grand Bar', 'Mary Janes'= 'Q6 | Anonymous brown globs that come in black and orange wrappers   (a.k.a. Mary Janes)', 'Full Size Candy Bar' = 'Q6 | Any full-sized candy bar', 'Black Jacks' = 'Q6 | Black Jacks', Bonkers = 'Q6 | Bonkers (the candy)', Butterfinger = "Q6 | Butterfinger", 'Candy Corn' = 'Q6 | Candy Corn')

head(candyhierarchy2017)

## # A tibble: 6 x 120
##         id `Going out` Sex   Age   Country State    `100 Grand Bar`
##      <int> <chr>       <chr> <chr> <chr>   <chr>    <chr>          
## 1 90258773 <NA>        <NA>  <NA>  <NA>    <NA>     <NA>           
## 2 90272821 No          Male  44    USA     NM       MEH            
## 3 90272829 <NA>        Male  49    USA     Virginia <NA>           
## 4 90272840 No          Male  40    us      or       MEH            
## 5 90272841 No          Male  23    usa     exton pa JOY            
## 6 90272852 No          Male  <NA>  <NA>    <NA>     JOY            
## # ... with 113 more variables: `Mary Janes` <chr>, `Full Size Candy
## #   Bar` <chr>, `Black Jacks` <chr>, Bonkers <chr>, `Q6 | Bonkers (the
## #   board game)` <chr>, `Q6 | Bottle Caps` <chr>, `Q6 |
## #   Box'o'Raisins` <chr>, `Q6 | Broken glow stick` <chr>,
## #   Butterfinger <chr>, `Q6 | Cadbury Creme Eggs` <chr>, `Candy
## #   Corn` <chr>, `Q6 | Candy that is clearly just the stuff given out for
## #   free at restaurants` <chr>, `Q6 | Caramellos` <chr>, `Q6 | Cash, or
## #   other forms of legal tender` <chr>, `Q6 | Chardonnay` <chr>, `Q6 |
## #   Chick-o-Sticks (we don\xd5t know what that is)` <chr>, `Q6 |
## #   Chiclets` <chr>, `Q6 | Coffee Crisp` <chr>, `Q6 | Creepy Religious
## #   comics/Chick Tracts` <chr>, `Q6 | Dental paraphenalia` <chr>, `Q6 |
## #   Dots` <chr>, `Q6 | Dove Bars` <chr>, `Q6 | Fuzzy Peaches` <chr>, `Q6 |
## #   Generic Brand Acetaminophen` <chr>, `Q6 | Glow sticks` <chr>, `Q6 |
## #   Goo Goo Clusters` <chr>, `Q6 | Good N' Plenty` <chr>, `Q6 | Gum from
## #   baseball cards` <chr>, `Q6 | Gummy Bears straight up` <chr>, `Q6 |
## #   Hard Candy` <chr>, `Q6 | Healthy Fruit` <chr>, `Q6 | Heath Bar` <chr>,
## #   `Q6 | Hershey's Dark Chocolate` <chr>, `Q6 | Hershey\xd5s Milk
## #   Chocolate` <chr>, `Q6 | Hershey's Kisses` <chr>, `Q6 | Hugs (actual
## #   physical hugs)` <chr>, `Q6 | Jolly Rancher (bad flavor)` <chr>, `Q6 |
## #   Jolly Ranchers (good flavor)` <chr>, `Q6 | JoyJoy (Mit
## #   Iodine!)` <chr>, `Q6 | Junior Mints` <chr>, `Q6 | Senior Mints` <chr>,
## #   `Q6 | Kale smoothie` <chr>, `Q6 | Kinder Happy Hippo` <chr>, `Q6 | Kit
## #   Kat` <chr>, `Q6 | LaffyTaffy` <chr>, `Q6 | LemonHeads` <chr>, `Q6 |
## #   Licorice (not black)` <chr>, `Q6 | Licorice (yes black)` <chr>, `Q6 |
## #   Lindt Truffle` <chr>, `Q6 | Lollipops` <chr>, `Q6 | Mars` <chr>, `Q6 |
## #   Maynards` <chr>, `Q6 | Mike and Ike` <chr>, `Q6 | Milk Duds` <chr>,
## #   `Q6 | Milky Way` <chr>, `Q6 | Regular M&Ms` <chr>, `Q6 | Peanut
## #   M&M\xd5s` <chr>, `Q6 | Blue M&M's` <chr>, `Q6 | Red M&M's` <chr>, `Q6
## #   | Green Party M&M's` <chr>, `Q6 | Independent M&M's` <chr>, `Q6 |
## #   Abstained from M&M'ing.` <chr>, `Q6 | Minibags of chips` <chr>, `Q6 |
## #   Mint Kisses` <chr>, `Q6 | Mint Juleps` <chr>, `Q6 | Mr.
## #   Goodbar` <chr>, `Q6 | Necco Wafers` <chr>, `Q6 | Nerds` <chr>, `Q6 |
## #   Nestle Crunch` <chr>, `Q6 | Now'n'Laters` <chr>, `Q6 | Peeps` <chr>,
## #   `Q6 | Pencils` <chr>, `Q6 | Pixy Stix` <chr>, `Q6 | Real Housewives of
## #   Orange County Season 9 Blue-Ray` <chr>, `Q6 | Reese\xd5s Peanut Butter
## #   Cups` <chr>, `Q6 | Reese's Pieces` <chr>, `Q6 | Reggie Jackson
## #   Bar` <chr>, `Q6 | Rolos` <chr>, `Q6 | Sandwich-sized bags filled with
## #   BooBerry Crunch` <chr>, `Q6 | Skittles` <chr>, `Q6 | Smarties
## #   (American)` <chr>, `Q6 | Smarties (Commonwealth)` <chr>, `Q6 |
## #   Snickers` <chr>, `Q6 | Sourpatch Kids (i.e. abominations of
## #   nature)` <chr>, `Q6 | Spotted Dick` <chr>, `Q6 | Starburst` <chr>, `Q6
## #   | Sweet Tarts` <chr>, `Q6 | Swedish Fish` <chr>, `Q6 | Sweetums (a
## #   friend to diabetes)` <chr>, `Q6 | Take 5` <chr>, `Q6 | Tic
## #   Tacs` <chr>, `Q6 | Those odd marshmallow circus peanut things` <chr>,
## #   `Q6 | Three Musketeers` <chr>, `Q6 | Tolberone something or
## #   other` <chr>, `Q6 | Trail Mix` <chr>, `Q6 | Twix` <chr>, `Q6 | Vials
## #   of pure high fructose corn syrup, for main-lining into your
## #   vein` <chr>, `Q6 | Vicodin` <chr>, `Q6 | Whatchamacallit Bars` <chr>,
## #   `Q6 | White Bread` <chr>, ...

The data is then filtered by columns. The data consists of 120 different variables, most which are not that useful. The data will filter out unnecessary columns from the dataset.

candy_remove_cols <- candyhierarchy2017[,-c(9:120)]
names(candy_remove_cols)

## [1] "id"            "Going out"     "Sex"           "Age"          
## [5] "Country"       "State"         "100 Grand Bar" "Mary Janes"

head(candy_remove_cols)

## # A tibble: 6 x 8
##         id `Going out` Sex   Age   Country State    `100 Grand Bar`
##      <int> <chr>       <chr> <chr> <chr>   <chr>    <chr>          
## 1 90258773 <NA>        <NA>  <NA>  <NA>    <NA>     <NA>           
## 2 90272821 No          Male  44    USA     NM       MEH            
## 3 90272829 <NA>        Male  49    USA     Virginia <NA>           
## 4 90272840 No          Male  40    us      or       MEH            
## 5 90272841 No          Male  23    usa     exton pa JOY            
## 6 90272852 No          Male  <NA>  <NA>    <NA>     JOY            
## # ... with 1 more variable: `Mary Janes` <chr>

candyhierarchy2017 <- candy_remove_cols

The data has many NA values - for this analysis, the NAs will be removed since there are still 1309 observations after the NA removal.

sum(is.na(candyhierarchy2017))

## [1] 1880

head(colSums(is.na(candyhierarchy2017)), 15)

##            id     Going out           Sex           Age       Country 
##             0           110            41            84            64 
##         State 100 Grand Bar    Mary Janes 
##           100           747           734

candy <- candyhierarchy2017

#remove all NA from dataset
candy2 <- na.omit(candy)
candy2

## # A tibble: 1,561 x 8
##          id `Going out` Sex    Age   Country State      `100 Grand Bar`
##       <int> <chr>       <chr>  <chr> <chr>   <chr>      <chr>          
##  1 90272821 No          Male   44    USA     NM         MEH            
##  2 90272840 No          Male   40    us      or         MEH            
##  3 90272841 No          Male   23    usa     exton pa   JOY            
##  4 90272854 No          Male   33    canada  ontario    JOY            
##  5 90272858 No          Male   40    Canada  Ontario    JOY            
##  6 90272859 No          Female 53    Us      Wa         MEH            
##  7 90272865 No          Male   56    Canada  Quebec     JOY            
##  8 90272866 No          Male   64    US      NY         MEH            
##  9 90272867 Yes         Male   43    Murica  California JOY            
## 10 90272868 No          Female 37    Canada  Ontario    MEH            
## # ... with 1,551 more rows, and 1 more variable: `Mary Janes` <chr>

Analysis

The data is then aggregated by sex, so we can get an idea of how many of each gender are in our dataset.

candy2_sex <- candy2 %>%
  group_by(Sex) %>%
  summarize(n=n())

candy2_sex

## # A tibble: 4 x 2
##   Sex                    n
##   <chr>              <int>
## 1 Female               545
## 2 I'd rather not say    48
## 3 Male                 948
## 4 Other                 20

Most of the participants in the data/survey are Male.

ggplot(data = candy2_sex, aes(candy2_sex$Sex, candy2_sex$n)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill=Sex)) +
  xlab("Sex") + ylab("Count")

The data shows that most people, for Halloween, are not going out.

candy2_goingout <- candy2 %>%
  group_by(`Going out`) %>%
  summarize(n=n())

candy2_goingout

## # A tibble: 2 x 2
##   `Going out`     n
##   <chr>       <int>
## 1 No           1347
## 2 Yes           214

ggplot(data = candy2_goingout, aes(candy2_goingout$`Going out`, candy2_goingout$n)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill=`Going out`)) +
  xlab("Sex") + ylab("Count")

This leads us to figure out how age impacts out survey.

candy2_age <- candy2 %>%
  group_by(Age) %>%
  summarize(n=n())

candy2_age

## # A tibble: 93 x 2
##    Age       n
##    <chr> <int>
##  1 ?         1
##  2 10        3
##  3 100       1
##  4 1000      1
##  5 102       1
##  6 11        4
##  7 12        6
##  8 13        4
##  9 14        2
## 10 15        5
## # ... with 83 more rows

Looking at the Age data, we can see that some participants didn’t take the survey seriously … “old enough”, “MY NAME JEFF”, “older than dirt”. We need to clean this.

Cleaning Cont.

candy2$Age <- as.integer(candy2$Age)

## Warning: NAs introduced by coercion

The data also has some … older people that probably wont be going out for Halloween (1000 year olds … or they’re vampires).

candy2_age <- candy2 %>%
  group_by(Age, Sex) %>%
  filter(Age < 90) %>%
  summarize(n=n())

Analysis and Conclusions

If we visual the age data we get the following:

ggplot(data = candy2_age, aes(candy2_age$Age, candy2_age$n)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill=candy2_age$Sex)) +
  xlab("Age") + ylab("Count") +
  geom_vline(xintercept=41.5, color="red", size=1)

The data resemble a normal distribution,with a mean and median of 41.5. It seems that for our data, Males around the age of 30-60 are not going out for halloween!

Marriages and Divorce

data <- read.csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/national_marriage_divorce_rates_00-16.csv")

head(data, 10)

##    ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016
## 1                                                                                
## 2                                                                            Year
## 3                                                                            2016
## 4                                                                            2015
## 5                                                                          2014/1
## 6                                                                          2013/1
## 7                                                                            2012
## 8                                                                            2011
## 9                                                                            2010
## 10                                                                           2009
##            X         X.1                             X.2 X.3 X.4 X.5 X.6
## 1                                                         NA  NA  NA  NA
## 2  Marriages  Population Rate per 1,000 total population  NA  NA  NA  NA
## 3  2,245,404 323,127,513                             6.9  NA  NA  NA  NA
## 4  2,221,579 321,418,820                             6.9  NA  NA  NA  NA
## 5  2,140,272 308,759,713                             6.9  NA  NA  NA  NA
## 6  2,081,301 306,136,672                             6.8  NA  NA  NA  NA
## 7  2,131,000 313,914,040                             6.8  NA  NA  NA  NA
## 8  2,118,000 311,591,917                             6.8  NA  NA  NA  NA
## 9  2,096,000 308,745,538                             6.8  NA  NA  NA  NA
## 10 2,080,000 306,771,529                             6.8  NA  NA  NA  NA
##    X.7 X.8
## 1   NA  NA
## 2   NA  NA
## 3   NA  NA
## 4   NA  NA
## 5   NA  NA
## 6   NA  NA
## 7   NA  NA
## 8   NA  NA
## 9   NA  NA
## 10  NA  NA

Tidy/Clean Data

The data has a lot of columns that don’t really make sense. The rename function will be applied using the tidyverse package in order to rename columns to more “user friendly” names.

data <- data %>%
  as.tbl() %>%
  select(ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016,
         X,
         X.1,
         X.2) %>%
  rename(num_marrage_rate = ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016, marriages = X, population = X.1, rate_per_1000 = X.2)

data

## # A tibble: 60 x 4
##    num_marrage_rate marriages population  rate_per_1000                  
##    <fct>            <fct>     <fct>       <fct>                          
##  1 ""               ""        ""          ""                             
##  2 Year             Marriages Population  Rate per 1,000 total population
##  3 2016             2,245,404 323,127,513 6.9                            
##  4 2015             2,221,579 321,418,820 6.9                            
##  5 2014/1           2,140,272 308,759,713 6.9                            
##  6 2013/1           2,081,301 306,136,672 6.8                            
##  7 2012             2,131,000 313,914,040 6.8                            
##  8 2011             2,118,000 311,591,917 6.8                            
##  9 2010             2,096,000 308,745,538 6.8                            
## 10 2009             2,080,000 306,771,529 6.8                            
## # ... with 50 more rows

Some of the data also has unnecessary characters, such as “/” - referring to footnotes. The gsub function can be used to parse the data and remove those characters.

data1 <- data
data1$num_marrage_rate <- gsub("/\\d", "", data$num_marrage_rate)
data1 <- data1[-c(1:2), ]
data1

## # A tibble: 58 x 4
##    num_marrage_rate marriages population  rate_per_1000
##    <chr>            <fct>     <fct>       <fct>        
##  1 2016             2,245,404 323,127,513 6.9          
##  2 2015             2,221,579 321,418,820 6.9          
##  3 2014             2,140,272 308,759,713 6.9          
##  4 2013             2,081,301 306,136,672 6.8          
##  5 2012             2,131,000 313,914,040 6.8          
##  6 2011             2,118,000 311,591,917 6.8          
##  7 2010             2,096,000 308,745,538 6.8          
##  8 2009             2,080,000 306,771,529 6.8          
##  9 2008             2,157,000 304,093,966 7.1          
## 10 2007             2,197,000 301,231,207 7.3          
## # ... with 48 more rows

The marriages column includes commas, so they can be removed so it make our calculations easier. Once these commas are omitted, the data is convereted into intergers instead of characters.

data1$marriages <- gsub(",", "", data1$marriages)
data1$population <- gsub(",", "", data1$population)
data1$marriages <- as.integer(data1$marriages)

## Warning: NAs introduced by coercion

data1$population <- as.integer(data1$population)

## Warning: NAs introduced by coercion

data1

## # A tibble: 58 x 4
##    num_marrage_rate marriages population rate_per_1000
##    <chr>                <int>      <int> <fct>        
##  1 2016               2245404  323127513 6.9          
##  2 2015               2221579  321418820 6.9          
##  3 2014               2140272  308759713 6.9          
##  4 2013               2081301  306136672 6.8          
##  5 2012               2131000  313914040 6.8          
##  6 2011               2118000  311591917 6.8          
##  7 2010               2096000  308745538 6.8          
##  8 2009               2080000  306771529 6.8          
##  9 2008               2157000  304093966 7.1          
## 10 2007               2197000  301231207 7.3          
## # ... with 48 more rows

The marriage and divorce are actually under the same column. We can fix this by seperating and saving those columns into new variables, and renaming.

We now have three cleaned variables - one for marriages one for divorce, and one combined variable.

Analysis and Conclusions

ggplot(data = df_divorce, aes(df_divorce$num_marrage_rate, df_divorce$divorce)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill=divorce)) +
  theme(legend.position="none") +
  xlab("Year") +
  ylab("Count") +
  ggtitle("Divorce")

ggplot(data = df_marrages, aes(df_marrages$num_marrage_rate, df_marrages$marriages)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill = marriages)) +
  theme(legend.position="none") +
  xlab("Year") +
  ylab("Count") +
  ggtitle("Marriages")

ggplot(df_combine, aes(num_marrage_rate)) + 
  geom_point(aes(y = marriages, colour = "marriages")) + 
  geom_point(aes(y = divorce, colour = "divorce")) + xlab("Year") + ylab("Count") +
  ggtitle("Marriage vs. Divorse")

In order to see how the # of marriages and divorces have changed, we can calculate a ratio of divorces/marriages

df_combine <- df_combine %>%
  mutate(dm_ratio = df_combine$divorce/df_combine$marriages)

ggplot(df_combine, aes(df_combine$num_marrage_rate, dm_ratio)) +
  geom_point(aes(color = dm_ratio)) + xlab("Year") + ylab("Ratio") +
  theme_light()

According to the data, it shows that the # of divorces have decreased over the dataset.

Baby Names

The head of the data is first displayed. This shows exactly what data are being worked with.

Popular_Baby_Names <- read_csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/Popular_Baby_Names.csv")

## Parsed with column specification:
## cols(
##   `Year of Birth` = col_integer(),
##   Gender = col_character(),
##   Ethnicity = col_character(),
##   `Child's First Name` = col_character(),
##   Count = col_integer(),
##   Rank = col_integer()
## )

head(Popular_Baby_Names)

## # A tibble: 6 x 6
##   `Year of Birth` Gender Ethnicity          `Child's First Na~ Count  Rank
##             <int> <chr>  <chr>              <chr>              <int> <int>
## 1            2011 FEMALE ASIAN AND PACIFIC~ SOPHIA               119     1
## 2            2011 FEMALE ASIAN AND PACIFIC~ CHLOE                106     2
## 3            2011 FEMALE ASIAN AND PACIFIC~ EMILY                 93     3
## 4            2011 FEMALE ASIAN AND PACIFIC~ OLIVIA                89     4
## 5            2011 FEMALE ASIAN AND PACIFIC~ EMMA                  75     5
## 6            2011 FEMALE ASIAN AND PACIFIC~ ISABELLA              67     6

Data Wrangling and Analysis

The first manipulation performed is an aggeragation by Year and Count. This shows how many total counts we have per year.

Names_recorded <- Popular_Baby_Names %>%
  group_by(`Year of Birth`) %>%
  summarize(n=n())

Names_recorded

## # A tibble: 6 x 2
##   `Year of Birth`     n
##             <int> <int>
## 1            2011  1937
## 2            2012  1851
## 3            2013  1734
## 4            2014  1715
## 5            2015  2045
## 6            2016  2063

The data is then visualized using ggplot2. This shows the above aggregation in visual form.

ggplot(data = Names_recorded, aes(Names_recorded$`Year of Birth`, Names_recorded$n)) + 
  stat_summary(fun.y = sum,
               geom = "bar", aes(fill = `Year of Birth`)) +
  xlab("Year of Birth") +
  ylab("Lab Recorded")

The data is then aggregated by ethnicity vs. count. This can give some insight to the total # of babies nammed for each ethnicity. Accoridng to the data, white non-hispanic, and hispanic have the most counts in the data.

ethnicitydata <- Popular_Baby_Names %>%
  group_by(Ethnicity) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

ethnicitydata

## # A tibble: 7 x 2
##   Ethnicity                      n
##   <chr>                      <int>
## 1 WHITE NON HISPANIC          3365
## 2 HISPANIC                    3314
## 3 BLACK NON HISPANIC          1704
## 4 ASIAN AND PACIFIC ISLANDER  1662
## 5 WHITE NON HISP               637
## 6 ASIAN AND PACI               335
## 7 BLACK NON HISP               328

The below is a visualization of the above data.

ggplot(ethnicitydata, aes(reorder(Ethnicity, -n), n)) +
         geom_bar(stat = "identity", aes(fill = Ethnicity)) +
         theme(axis.text.x=element_text(angle=45, hjust=1)) +
  xlab("Ethnicity") +
  theme(legend.position="none")

The data is then filtered to show the most popular names - Liam being the most popular in the dataset.

filter_names <- Popular_Baby_Names %>%
  filter(Count > 200) %>%
  select(`Child's First Name`, Count) %>%
  arrange(desc(Count))


filter_names

## # A tibble: 109 x 2
##    `Child's First Name` Count
##    <chr>                <int>
##  1 Liam                   387
##  2 Liam                   356
##  3 Jacob                  351
##  4 Dylan                  339
##  5 Ethan                  332
##  6 ISABELLA               331
##  7 Liam                   312
##  8 Dylan                  312
##  9 JUSTIN                 310
## 10 Matthew                308
## # ... with 99 more rows

filter_names <- Popular_Baby_Names %>%
  filter(Count > 200) %>%
  select(`Child's First Name`, Count) %>%
  arrange(desc(Count)) %>%
ggplot(., aes(`Child's First Name`, Count)) + 
  stat_summary(fun.y = sum,
               geom = "bar") +
  theme(axis.text.x=element_text(angle=90, hjust=1)) +
  theme(legend.position="none") +
  ylab("Count of Popular Names") +
  xlab("Ethnicity")

filter_names

Popular_Baby_Names %>%
  group_by(Popular_Baby_Names$Ethnicity) %>%
  filter(Count > 300) %>%
  arrange(desc(Count))

## # A tibble: 13 x 7
## # Groups:   Popular_Baby_Names$Ethnicity [2]
##    `Year of Birth` Gender Ethnicity        `Child's First Nam~ Count  Rank
##              <int> <chr>  <chr>            <chr>               <int> <int>
##  1            2016 MALE   HISPANIC         Liam                  387     1
##  2            2015 MALE   HISPANIC         Liam                  356     1
##  3            2016 MALE   HISPANIC         Jacob                 351     2
##  4            2015 MALE   HISPANIC         Dylan                 339     2
##  5            2015 MALE   HISPANIC         Ethan                 332     3
##  6            2011 FEMALE HISPANIC         ISABELLA              331     1
##  7            2014 MALE   HISPANIC         Liam                  312     1
##  8            2016 MALE   HISPANIC         Dylan                 312     3
##  9            2011 MALE   HISPANIC         JUSTIN                310     2
## 10            2015 MALE   HISPANIC         Matthew               308     4
## 11            2015 FEMALE HISPANIC         Isabella              307     1
## 12            2013 MALE   WHITE NON HISPA~ David                 304     1
## 13            2011 MALE   HISPANIC         JACOB                 303     3
## # ... with 1 more variable: `Popular_Baby_Names$Ethnicity` <chr>

References:

http://www.scq.ubc.ca/so-much-candy-data-seriously/ https://sejdemyr.github.io/r-tutorials/basics/wide-and-long/ http://r4ds.had.co.nz/strings.html#other-uses-of-regular-expressions

Project 2

Amanda Arce

October 6, 2018

Halloween

Tidy and clean data

Analysis

Cleaning Cont.

Analysis and Conclusions

Marriages and Divorce

Tidy/Clean Data

Analysis and Conclusions

Baby Names

Data Wrangling and Analysis

References: