library(tidyverse)
library(readr)
library(dplyr)
library(psych)Data has been taken from various sources, including classmates and various websites
The data is first read in using the read_csv function and displayed using head(). This shows what type of data is being worked with.
candyhierarchy2017 <- read_csv("candyhierarchy2017.csv")## Warning: Missing column names filled in: 'X114' [114]
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Internal ID` = col_integer(),
## `Q12: MEDIA [Daily Dish]` = col_integer(),
## `Q12: MEDIA [Science]` = col_integer(),
## `Q12: MEDIA [ESPN]` = col_integer(),
## `Q12: MEDIA [Yahoo]` = col_integer()
## )
## See spec(...) for full column specifications.
head(candyhierarchy2017)## # A tibble: 6 x 120
## `Internal ID` `Q1: GOING OUT?` `Q2: GENDER` `Q3: AGE` `Q4: COUNTRY`
## <int> <chr> <chr> <chr> <chr>
## 1 90258773 <NA> <NA> <NA> <NA>
## 2 90272821 No Male 44 USA
## 3 90272829 <NA> Male 49 USA
## 4 90272840 No Male 40 us
## 5 90272841 No Male 23 usa
## 6 90272852 No Male <NA> <NA>
## # ... with 115 more variables: `Q5: STATE, PROVINCE, COUNTY, ETC` <chr>,
## # `Q6: 100 Grand Bar` <chr>, `Q6 | Anonymous brown globs that come in
## # black and orange wrappers\t(a.k.a. Mary Janes)` <chr>, `Q6 | Any
## # full-sized candy bar` <chr>, `Q6 | Black Jacks` <chr>, `Q6 | Bonkers
## # (the candy)` <chr>, `Q6 | Bonkers (the board game)` <chr>, `Q6 |
## # Bottle Caps` <chr>, `Q6 | Box'o'Raisins` <chr>, `Q6 | Broken glow
## # stick` <chr>, `Q6 | Butterfinger` <chr>, `Q6 | Cadbury Creme
## # Eggs` <chr>, `Q6 | Candy Corn` <chr>, `Q6 | Candy that is clearly just
## # the stuff given out for free at restaurants` <chr>, `Q6 |
## # Caramellos` <chr>, `Q6 | Cash, or other forms of legal tender` <chr>,
## # `Q6 | Chardonnay` <chr>, `Q6 | Chick-o-Sticks (we don\xd5t know what
## # that is)` <chr>, `Q6 | Chiclets` <chr>, `Q6 | Coffee Crisp` <chr>, `Q6
## # | Creepy Religious comics/Chick Tracts` <chr>, `Q6 | Dental
## # paraphenalia` <chr>, `Q6 | Dots` <chr>, `Q6 | Dove Bars` <chr>, `Q6 |
## # Fuzzy Peaches` <chr>, `Q6 | Generic Brand Acetaminophen` <chr>, `Q6 |
## # Glow sticks` <chr>, `Q6 | Goo Goo Clusters` <chr>, `Q6 | Good N'
## # Plenty` <chr>, `Q6 | Gum from baseball cards` <chr>, `Q6 | Gummy Bears
## # straight up` <chr>, `Q6 | Hard Candy` <chr>, `Q6 | Healthy
## # Fruit` <chr>, `Q6 | Heath Bar` <chr>, `Q6 | Hershey's Dark
## # Chocolate` <chr>, `Q6 | Hershey\xd5s Milk Chocolate` <chr>, `Q6 |
## # Hershey's Kisses` <chr>, `Q6 | Hugs (actual physical hugs)` <chr>, `Q6
## # | Jolly Rancher (bad flavor)` <chr>, `Q6 | Jolly Ranchers (good
## # flavor)` <chr>, `Q6 | JoyJoy (Mit Iodine!)` <chr>, `Q6 | Junior
## # Mints` <chr>, `Q6 | Senior Mints` <chr>, `Q6 | Kale smoothie` <chr>,
## # `Q6 | Kinder Happy Hippo` <chr>, `Q6 | Kit Kat` <chr>, `Q6 |
## # LaffyTaffy` <chr>, `Q6 | LemonHeads` <chr>, `Q6 | Licorice (not
## # black)` <chr>, `Q6 | Licorice (yes black)` <chr>, `Q6 | Lindt
## # Truffle` <chr>, `Q6 | Lollipops` <chr>, `Q6 | Mars` <chr>, `Q6 |
## # Maynards` <chr>, `Q6 | Mike and Ike` <chr>, `Q6 | Milk Duds` <chr>,
## # `Q6 | Milky Way` <chr>, `Q6 | Regular M&Ms` <chr>, `Q6 | Peanut
## # M&M\xd5s` <chr>, `Q6 | Blue M&M's` <chr>, `Q6 | Red M&M's` <chr>, `Q6
## # | Green Party M&M's` <chr>, `Q6 | Independent M&M's` <chr>, `Q6 |
## # Abstained from M&M'ing.` <chr>, `Q6 | Minibags of chips` <chr>, `Q6 |
## # Mint Kisses` <chr>, `Q6 | Mint Juleps` <chr>, `Q6 | Mr.
## # Goodbar` <chr>, `Q6 | Necco Wafers` <chr>, `Q6 | Nerds` <chr>, `Q6 |
## # Nestle Crunch` <chr>, `Q6 | Now'n'Laters` <chr>, `Q6 | Peeps` <chr>,
## # `Q6 | Pencils` <chr>, `Q6 | Pixy Stix` <chr>, `Q6 | Real Housewives of
## # Orange County Season 9 Blue-Ray` <chr>, `Q6 | Reese\xd5s Peanut Butter
## # Cups` <chr>, `Q6 | Reese's Pieces` <chr>, `Q6 | Reggie Jackson
## # Bar` <chr>, `Q6 | Rolos` <chr>, `Q6 | Sandwich-sized bags filled with
## # BooBerry Crunch` <chr>, `Q6 | Skittles` <chr>, `Q6 | Smarties
## # (American)` <chr>, `Q6 | Smarties (Commonwealth)` <chr>, `Q6 |
## # Snickers` <chr>, `Q6 | Sourpatch Kids (i.e. abominations of
## # nature)` <chr>, `Q6 | Spotted Dick` <chr>, `Q6 | Starburst` <chr>, `Q6
## # | Sweet Tarts` <chr>, `Q6 | Swedish Fish` <chr>, `Q6 | Sweetums (a
## # friend to diabetes)` <chr>, `Q6 | Take 5` <chr>, `Q6 | Tic
## # Tacs` <chr>, `Q6 | Those odd marshmallow circus peanut things` <chr>,
## # `Q6 | Three Musketeers` <chr>, `Q6 | Tolberone something or
## # other` <chr>, `Q6 | Trail Mix` <chr>, `Q6 | Twix` <chr>, `Q6 | Vials
## # of pure high fructose corn syrup, for main-lining into your
## # vein` <chr>, `Q6 | Vicodin` <chr>, ...
The columns are then renammed using the rename() function. This makes it easier to understand what each column means, and gives them meaningful names.
candyhierarchy2017 <- rename(candyhierarchy2017,id = 'Internal ID', 'Going out' = 'Q1: GOING OUT?', Sex = 'Q2: GENDER', Age = "Q3: AGE", Country = "Q4: COUNTRY", State = "Q5: STATE, PROVINCE, COUNTY, ETC", '100 Grand Bar' = 'Q6: 100 Grand Bar', 'Mary Janes'= 'Q6 | Anonymous brown globs that come in black and orange wrappers (a.k.a. Mary Janes)', 'Full Size Candy Bar' = 'Q6 | Any full-sized candy bar', 'Black Jacks' = 'Q6 | Black Jacks', Bonkers = 'Q6 | Bonkers (the candy)', Butterfinger = "Q6 | Butterfinger", 'Candy Corn' = 'Q6 | Candy Corn')
head(candyhierarchy2017)## # A tibble: 6 x 120
## id `Going out` Sex Age Country State `100 Grand Bar`
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 90258773 <NA> <NA> <NA> <NA> <NA> <NA>
## 2 90272821 No Male 44 USA NM MEH
## 3 90272829 <NA> Male 49 USA Virginia <NA>
## 4 90272840 No Male 40 us or MEH
## 5 90272841 No Male 23 usa exton pa JOY
## 6 90272852 No Male <NA> <NA> <NA> JOY
## # ... with 113 more variables: `Mary Janes` <chr>, `Full Size Candy
## # Bar` <chr>, `Black Jacks` <chr>, Bonkers <chr>, `Q6 | Bonkers (the
## # board game)` <chr>, `Q6 | Bottle Caps` <chr>, `Q6 |
## # Box'o'Raisins` <chr>, `Q6 | Broken glow stick` <chr>,
## # Butterfinger <chr>, `Q6 | Cadbury Creme Eggs` <chr>, `Candy
## # Corn` <chr>, `Q6 | Candy that is clearly just the stuff given out for
## # free at restaurants` <chr>, `Q6 | Caramellos` <chr>, `Q6 | Cash, or
## # other forms of legal tender` <chr>, `Q6 | Chardonnay` <chr>, `Q6 |
## # Chick-o-Sticks (we don\xd5t know what that is)` <chr>, `Q6 |
## # Chiclets` <chr>, `Q6 | Coffee Crisp` <chr>, `Q6 | Creepy Religious
## # comics/Chick Tracts` <chr>, `Q6 | Dental paraphenalia` <chr>, `Q6 |
## # Dots` <chr>, `Q6 | Dove Bars` <chr>, `Q6 | Fuzzy Peaches` <chr>, `Q6 |
## # Generic Brand Acetaminophen` <chr>, `Q6 | Glow sticks` <chr>, `Q6 |
## # Goo Goo Clusters` <chr>, `Q6 | Good N' Plenty` <chr>, `Q6 | Gum from
## # baseball cards` <chr>, `Q6 | Gummy Bears straight up` <chr>, `Q6 |
## # Hard Candy` <chr>, `Q6 | Healthy Fruit` <chr>, `Q6 | Heath Bar` <chr>,
## # `Q6 | Hershey's Dark Chocolate` <chr>, `Q6 | Hershey\xd5s Milk
## # Chocolate` <chr>, `Q6 | Hershey's Kisses` <chr>, `Q6 | Hugs (actual
## # physical hugs)` <chr>, `Q6 | Jolly Rancher (bad flavor)` <chr>, `Q6 |
## # Jolly Ranchers (good flavor)` <chr>, `Q6 | JoyJoy (Mit
## # Iodine!)` <chr>, `Q6 | Junior Mints` <chr>, `Q6 | Senior Mints` <chr>,
## # `Q6 | Kale smoothie` <chr>, `Q6 | Kinder Happy Hippo` <chr>, `Q6 | Kit
## # Kat` <chr>, `Q6 | LaffyTaffy` <chr>, `Q6 | LemonHeads` <chr>, `Q6 |
## # Licorice (not black)` <chr>, `Q6 | Licorice (yes black)` <chr>, `Q6 |
## # Lindt Truffle` <chr>, `Q6 | Lollipops` <chr>, `Q6 | Mars` <chr>, `Q6 |
## # Maynards` <chr>, `Q6 | Mike and Ike` <chr>, `Q6 | Milk Duds` <chr>,
## # `Q6 | Milky Way` <chr>, `Q6 | Regular M&Ms` <chr>, `Q6 | Peanut
## # M&M\xd5s` <chr>, `Q6 | Blue M&M's` <chr>, `Q6 | Red M&M's` <chr>, `Q6
## # | Green Party M&M's` <chr>, `Q6 | Independent M&M's` <chr>, `Q6 |
## # Abstained from M&M'ing.` <chr>, `Q6 | Minibags of chips` <chr>, `Q6 |
## # Mint Kisses` <chr>, `Q6 | Mint Juleps` <chr>, `Q6 | Mr.
## # Goodbar` <chr>, `Q6 | Necco Wafers` <chr>, `Q6 | Nerds` <chr>, `Q6 |
## # Nestle Crunch` <chr>, `Q6 | Now'n'Laters` <chr>, `Q6 | Peeps` <chr>,
## # `Q6 | Pencils` <chr>, `Q6 | Pixy Stix` <chr>, `Q6 | Real Housewives of
## # Orange County Season 9 Blue-Ray` <chr>, `Q6 | Reese\xd5s Peanut Butter
## # Cups` <chr>, `Q6 | Reese's Pieces` <chr>, `Q6 | Reggie Jackson
## # Bar` <chr>, `Q6 | Rolos` <chr>, `Q6 | Sandwich-sized bags filled with
## # BooBerry Crunch` <chr>, `Q6 | Skittles` <chr>, `Q6 | Smarties
## # (American)` <chr>, `Q6 | Smarties (Commonwealth)` <chr>, `Q6 |
## # Snickers` <chr>, `Q6 | Sourpatch Kids (i.e. abominations of
## # nature)` <chr>, `Q6 | Spotted Dick` <chr>, `Q6 | Starburst` <chr>, `Q6
## # | Sweet Tarts` <chr>, `Q6 | Swedish Fish` <chr>, `Q6 | Sweetums (a
## # friend to diabetes)` <chr>, `Q6 | Take 5` <chr>, `Q6 | Tic
## # Tacs` <chr>, `Q6 | Those odd marshmallow circus peanut things` <chr>,
## # `Q6 | Three Musketeers` <chr>, `Q6 | Tolberone something or
## # other` <chr>, `Q6 | Trail Mix` <chr>, `Q6 | Twix` <chr>, `Q6 | Vials
## # of pure high fructose corn syrup, for main-lining into your
## # vein` <chr>, `Q6 | Vicodin` <chr>, `Q6 | Whatchamacallit Bars` <chr>,
## # `Q6 | White Bread` <chr>, ...
The data is then filtered by columns. The data consists of 120 different variables, most which are not that useful. The data will filter out unnecessary columns from the dataset.
candy_remove_cols <- candyhierarchy2017[,-c(9:120)]
names(candy_remove_cols)## [1] "id" "Going out" "Sex" "Age"
## [5] "Country" "State" "100 Grand Bar" "Mary Janes"
head(candy_remove_cols)## # A tibble: 6 x 8
## id `Going out` Sex Age Country State `100 Grand Bar`
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 90258773 <NA> <NA> <NA> <NA> <NA> <NA>
## 2 90272821 No Male 44 USA NM MEH
## 3 90272829 <NA> Male 49 USA Virginia <NA>
## 4 90272840 No Male 40 us or MEH
## 5 90272841 No Male 23 usa exton pa JOY
## 6 90272852 No Male <NA> <NA> <NA> JOY
## # ... with 1 more variable: `Mary Janes` <chr>
candyhierarchy2017 <- candy_remove_colsThe data has many NA values - for this analysis, the NAs will be removed since there are still 1309 observations after the NA removal.
sum(is.na(candyhierarchy2017))## [1] 1880
head(colSums(is.na(candyhierarchy2017)), 15)## id Going out Sex Age Country
## 0 110 41 84 64
## State 100 Grand Bar Mary Janes
## 100 747 734
candy <- candyhierarchy2017#remove all NA from dataset
candy2 <- na.omit(candy)
candy2## # A tibble: 1,561 x 8
## id `Going out` Sex Age Country State `100 Grand Bar`
## <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 90272821 No Male 44 USA NM MEH
## 2 90272840 No Male 40 us or MEH
## 3 90272841 No Male 23 usa exton pa JOY
## 4 90272854 No Male 33 canada ontario JOY
## 5 90272858 No Male 40 Canada Ontario JOY
## 6 90272859 No Female 53 Us Wa MEH
## 7 90272865 No Male 56 Canada Quebec JOY
## 8 90272866 No Male 64 US NY MEH
## 9 90272867 Yes Male 43 Murica California JOY
## 10 90272868 No Female 37 Canada Ontario MEH
## # ... with 1,551 more rows, and 1 more variable: `Mary Janes` <chr>
The data is then aggregated by sex, so we can get an idea of how many of each gender are in our dataset.
candy2_sex <- candy2 %>%
group_by(Sex) %>%
summarize(n=n())
candy2_sex## # A tibble: 4 x 2
## Sex n
## <chr> <int>
## 1 Female 545
## 2 I'd rather not say 48
## 3 Male 948
## 4 Other 20
Most of the participants in the data/survey are Male.
ggplot(data = candy2_sex, aes(candy2_sex$Sex, candy2_sex$n)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill=Sex)) +
xlab("Sex") + ylab("Count")The data shows that most people, for Halloween, are not going out.
candy2_goingout <- candy2 %>%
group_by(`Going out`) %>%
summarize(n=n())
candy2_goingout## # A tibble: 2 x 2
## `Going out` n
## <chr> <int>
## 1 No 1347
## 2 Yes 214
ggplot(data = candy2_goingout, aes(candy2_goingout$`Going out`, candy2_goingout$n)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill=`Going out`)) +
xlab("Sex") + ylab("Count")This leads us to figure out how age impacts out survey.
candy2_age <- candy2 %>%
group_by(Age) %>%
summarize(n=n())
candy2_age## # A tibble: 93 x 2
## Age n
## <chr> <int>
## 1 ? 1
## 2 10 3
## 3 100 1
## 4 1000 1
## 5 102 1
## 6 11 4
## 7 12 6
## 8 13 4
## 9 14 2
## 10 15 5
## # ... with 83 more rows
Looking at the Age data, we can see that some participants didn’t take the survey seriously … “old enough”, “MY NAME JEFF”, “older than dirt”. We need to clean this.
candy2$Age <- as.integer(candy2$Age)## Warning: NAs introduced by coercion
The data also has some … older people that probably wont be going out for Halloween (1000 year olds … or they’re vampires).
candy2_age <- candy2 %>%
group_by(Age, Sex) %>%
filter(Age < 90) %>%
summarize(n=n())If we visual the age data we get the following:
ggplot(data = candy2_age, aes(candy2_age$Age, candy2_age$n)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill=candy2_age$Sex)) +
xlab("Age") + ylab("Count") +
geom_vline(xintercept=41.5, color="red", size=1)The data resemble a normal distribution,with a mean and median of 41.5. It seems that for our data, Males around the age of 30-60 are not going out for halloween!
data <- read.csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/national_marriage_divorce_rates_00-16.csv")
head(data, 10)## ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016
## 1
## 2 Year
## 3 2016
## 4 2015
## 5 2014/1
## 6 2013/1
## 7 2012
## 8 2011
## 9 2010
## 10 2009
## X X.1 X.2 X.3 X.4 X.5 X.6
## 1 NA NA NA NA
## 2 Marriages Population Rate per 1,000 total population NA NA NA NA
## 3 2,245,404 323,127,513 6.9 NA NA NA NA
## 4 2,221,579 321,418,820 6.9 NA NA NA NA
## 5 2,140,272 308,759,713 6.9 NA NA NA NA
## 6 2,081,301 306,136,672 6.8 NA NA NA NA
## 7 2,131,000 313,914,040 6.8 NA NA NA NA
## 8 2,118,000 311,591,917 6.8 NA NA NA NA
## 9 2,096,000 308,745,538 6.8 NA NA NA NA
## 10 2,080,000 306,771,529 6.8 NA NA NA NA
## X.7 X.8
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 8 NA NA
## 9 NA NA
## 10 NA NA
The data has a lot of columns that don’t really make sense. The rename function will be applied using the tidyverse package in order to rename columns to more “user friendly” names.
data <- data %>%
as.tbl() %>%
select(ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016,
X,
X.1,
X.2) %>%
rename(num_marrage_rate = ï..Provisional.number.of.marriages.and.marriage.rate..United.States..2000.2016, marriages = X, population = X.1, rate_per_1000 = X.2)
data## # A tibble: 60 x 4
## num_marrage_rate marriages population rate_per_1000
## <fct> <fct> <fct> <fct>
## 1 "" "" "" ""
## 2 Year Marriages Population Rate per 1,000 total population
## 3 2016 2,245,404 323,127,513 6.9
## 4 2015 2,221,579 321,418,820 6.9
## 5 2014/1 2,140,272 308,759,713 6.9
## 6 2013/1 2,081,301 306,136,672 6.8
## 7 2012 2,131,000 313,914,040 6.8
## 8 2011 2,118,000 311,591,917 6.8
## 9 2010 2,096,000 308,745,538 6.8
## 10 2009 2,080,000 306,771,529 6.8
## # ... with 50 more rows
Some of the data also has unnecessary characters, such as “/” - referring to footnotes. The gsub function can be used to parse the data and remove those characters.
data1 <- data
data1$num_marrage_rate <- gsub("/\\d", "", data$num_marrage_rate)
data1 <- data1[-c(1:2), ]
data1## # A tibble: 58 x 4
## num_marrage_rate marriages population rate_per_1000
## <chr> <fct> <fct> <fct>
## 1 2016 2,245,404 323,127,513 6.9
## 2 2015 2,221,579 321,418,820 6.9
## 3 2014 2,140,272 308,759,713 6.9
## 4 2013 2,081,301 306,136,672 6.8
## 5 2012 2,131,000 313,914,040 6.8
## 6 2011 2,118,000 311,591,917 6.8
## 7 2010 2,096,000 308,745,538 6.8
## 8 2009 2,080,000 306,771,529 6.8
## 9 2008 2,157,000 304,093,966 7.1
## 10 2007 2,197,000 301,231,207 7.3
## # ... with 48 more rows
The marriages column includes commas, so they can be removed so it make our calculations easier. Once these commas are omitted, the data is convereted into intergers instead of characters.
data1$marriages <- gsub(",", "", data1$marriages)
data1$population <- gsub(",", "", data1$population)
data1$marriages <- as.integer(data1$marriages)## Warning: NAs introduced by coercion
data1$population <- as.integer(data1$population)## Warning: NAs introduced by coercion
data1## # A tibble: 58 x 4
## num_marrage_rate marriages population rate_per_1000
## <chr> <int> <int> <fct>
## 1 2016 2245404 323127513 6.9
## 2 2015 2221579 321418820 6.9
## 3 2014 2140272 308759713 6.9
## 4 2013 2081301 306136672 6.8
## 5 2012 2131000 313914040 6.8
## 6 2011 2118000 311591917 6.8
## 7 2010 2096000 308745538 6.8
## 8 2009 2080000 306771529 6.8
## 9 2008 2157000 304093966 7.1
## 10 2007 2197000 301231207 7.3
## # ... with 48 more rows
The marriage and divorce are actually under the same column. We can fix this by seperating and saving those columns into new variables, and renaming.
We now have three cleaned variables - one for marriages one for divorce, and one combined variable.
ggplot(data = df_divorce, aes(df_divorce$num_marrage_rate, df_divorce$divorce)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill=divorce)) +
theme(legend.position="none") +
xlab("Year") +
ylab("Count") +
ggtitle("Divorce")ggplot(data = df_marrages, aes(df_marrages$num_marrage_rate, df_marrages$marriages)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill = marriages)) +
theme(legend.position="none") +
xlab("Year") +
ylab("Count") +
ggtitle("Marriages")ggplot(df_combine, aes(num_marrage_rate)) +
geom_point(aes(y = marriages, colour = "marriages")) +
geom_point(aes(y = divorce, colour = "divorce")) + xlab("Year") + ylab("Count") +
ggtitle("Marriage vs. Divorse")In order to see how the # of marriages and divorces have changed, we can calculate a ratio of divorces/marriages
df_combine <- df_combine %>%
mutate(dm_ratio = df_combine$divorce/df_combine$marriages)
ggplot(df_combine, aes(df_combine$num_marrage_rate, dm_ratio)) +
geom_point(aes(color = dm_ratio)) + xlab("Year") + ylab("Ratio") +
theme_light()According to the data, it shows that the # of divorces have decreased over the dataset.
The head of the data is first displayed. This shows exactly what data are being worked with.
Popular_Baby_Names <- read_csv("https://raw.githubusercontent.com/mandiemannz/Data-607--Fall-18/master/Popular_Baby_Names.csv")## Parsed with column specification:
## cols(
## `Year of Birth` = col_integer(),
## Gender = col_character(),
## Ethnicity = col_character(),
## `Child's First Name` = col_character(),
## Count = col_integer(),
## Rank = col_integer()
## )
head(Popular_Baby_Names)## # A tibble: 6 x 6
## `Year of Birth` Gender Ethnicity `Child's First Na~ Count Rank
## <int> <chr> <chr> <chr> <int> <int>
## 1 2011 FEMALE ASIAN AND PACIFIC~ SOPHIA 119 1
## 2 2011 FEMALE ASIAN AND PACIFIC~ CHLOE 106 2
## 3 2011 FEMALE ASIAN AND PACIFIC~ EMILY 93 3
## 4 2011 FEMALE ASIAN AND PACIFIC~ OLIVIA 89 4
## 5 2011 FEMALE ASIAN AND PACIFIC~ EMMA 75 5
## 6 2011 FEMALE ASIAN AND PACIFIC~ ISABELLA 67 6
The first manipulation performed is an aggeragation by Year and Count. This shows how many total counts we have per year.
Names_recorded <- Popular_Baby_Names %>%
group_by(`Year of Birth`) %>%
summarize(n=n())
Names_recorded## # A tibble: 6 x 2
## `Year of Birth` n
## <int> <int>
## 1 2011 1937
## 2 2012 1851
## 3 2013 1734
## 4 2014 1715
## 5 2015 2045
## 6 2016 2063
The data is then visualized using ggplot2. This shows the above aggregation in visual form.
ggplot(data = Names_recorded, aes(Names_recorded$`Year of Birth`, Names_recorded$n)) +
stat_summary(fun.y = sum,
geom = "bar", aes(fill = `Year of Birth`)) +
xlab("Year of Birth") +
ylab("Lab Recorded")The data is then aggregated by ethnicity vs. count. This can give some insight to the total # of babies nammed for each ethnicity. Accoridng to the data, white non-hispanic, and hispanic have the most counts in the data.
ethnicitydata <- Popular_Baby_Names %>%
group_by(Ethnicity) %>%
summarize(n=n()) %>%
arrange(desc(n))
ethnicitydata## # A tibble: 7 x 2
## Ethnicity n
## <chr> <int>
## 1 WHITE NON HISPANIC 3365
## 2 HISPANIC 3314
## 3 BLACK NON HISPANIC 1704
## 4 ASIAN AND PACIFIC ISLANDER 1662
## 5 WHITE NON HISP 637
## 6 ASIAN AND PACI 335
## 7 BLACK NON HISP 328
The below is a visualization of the above data.
ggplot(ethnicitydata, aes(reorder(Ethnicity, -n), n)) +
geom_bar(stat = "identity", aes(fill = Ethnicity)) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
xlab("Ethnicity") +
theme(legend.position="none")The data is then filtered to show the most popular names - Liam being the most popular in the dataset.
filter_names <- Popular_Baby_Names %>%
filter(Count > 200) %>%
select(`Child's First Name`, Count) %>%
arrange(desc(Count))
filter_names## # A tibble: 109 x 2
## `Child's First Name` Count
## <chr> <int>
## 1 Liam 387
## 2 Liam 356
## 3 Jacob 351
## 4 Dylan 339
## 5 Ethan 332
## 6 ISABELLA 331
## 7 Liam 312
## 8 Dylan 312
## 9 JUSTIN 310
## 10 Matthew 308
## # ... with 99 more rows
filter_names <- Popular_Baby_Names %>%
filter(Count > 200) %>%
select(`Child's First Name`, Count) %>%
arrange(desc(Count)) %>%
ggplot(., aes(`Child's First Name`, Count)) +
stat_summary(fun.y = sum,
geom = "bar") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
theme(legend.position="none") +
ylab("Count of Popular Names") +
xlab("Ethnicity")
filter_namesPopular_Baby_Names %>%
group_by(Popular_Baby_Names$Ethnicity) %>%
filter(Count > 300) %>%
arrange(desc(Count))## # A tibble: 13 x 7
## # Groups: Popular_Baby_Names$Ethnicity [2]
## `Year of Birth` Gender Ethnicity `Child's First Nam~ Count Rank
## <int> <chr> <chr> <chr> <int> <int>
## 1 2016 MALE HISPANIC Liam 387 1
## 2 2015 MALE HISPANIC Liam 356 1
## 3 2016 MALE HISPANIC Jacob 351 2
## 4 2015 MALE HISPANIC Dylan 339 2
## 5 2015 MALE HISPANIC Ethan 332 3
## 6 2011 FEMALE HISPANIC ISABELLA 331 1
## 7 2014 MALE HISPANIC Liam 312 1
## 8 2016 MALE HISPANIC Dylan 312 3
## 9 2011 MALE HISPANIC JUSTIN 310 2
## 10 2015 MALE HISPANIC Matthew 308 4
## 11 2015 FEMALE HISPANIC Isabella 307 1
## 12 2013 MALE WHITE NON HISPA~ David 304 1
## 13 2011 MALE HISPANIC JACOB 303 3
## # ... with 1 more variable: `Popular_Baby_Names$Ethnicity` <chr>